combine 2 df

dim(test)
## [1] 1459   80
str(train)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1460 obs. of  81 variables:
##  $ Id           : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : num  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : num  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : num  8450 9600 11250 9550 14260 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : num  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : num  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : num  2003 1976 2001 1915 2000 ...
##  $ YearRemodAdd : num  2003 1976 2002 1970 2000 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : num  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : num  706 978 486 216 655 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : num  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : num  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : num  856 1262 920 756 1145 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ 1stFlrSF     : num  856 1262 920 961 1145 ...
##  $ 2ndFlrSF     : num  854 0 866 756 1053 ...
##  $ LowQualFinSF : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : num  1710 1262 1786 1717 2198 ...
##  $ BsmtFullBath : num  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : num  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : num  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : num  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : num  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : num  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : num  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : num  2003 1976 2001 1998 2000 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : num  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : num  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : num  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : num  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: num  0 0 0 272 0 0 0 228 205 0 ...
##  $ 3SsnPorch    : num  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : num  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : num  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : num  2008 2007 2008 2006 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : num  208500 181500 223500 140000 250000 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Id = col_double(),
##   ..   MSSubClass = col_double(),
##   ..   MSZoning = col_character(),
##   ..   LotFrontage = col_double(),
##   ..   LotArea = col_double(),
##   ..   Street = col_character(),
##   ..   Alley = col_character(),
##   ..   LotShape = col_character(),
##   ..   LandContour = col_character(),
##   ..   Utilities = col_character(),
##   ..   LotConfig = col_character(),
##   ..   LandSlope = col_character(),
##   ..   Neighborhood = col_character(),
##   ..   Condition1 = col_character(),
##   ..   Condition2 = col_character(),
##   ..   BldgType = col_character(),
##   ..   HouseStyle = col_character(),
##   ..   OverallQual = col_double(),
##   ..   OverallCond = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   YearRemodAdd = col_double(),
##   ..   RoofStyle = col_character(),
##   ..   RoofMatl = col_character(),
##   ..   Exterior1st = col_character(),
##   ..   Exterior2nd = col_character(),
##   ..   MasVnrType = col_character(),
##   ..   MasVnrArea = col_double(),
##   ..   ExterQual = col_character(),
##   ..   ExterCond = col_character(),
##   ..   Foundation = col_character(),
##   ..   BsmtQual = col_character(),
##   ..   BsmtCond = col_character(),
##   ..   BsmtExposure = col_character(),
##   ..   BsmtFinType1 = col_character(),
##   ..   BsmtFinSF1 = col_double(),
##   ..   BsmtFinType2 = col_character(),
##   ..   BsmtFinSF2 = col_double(),
##   ..   BsmtUnfSF = col_double(),
##   ..   TotalBsmtSF = col_double(),
##   ..   Heating = col_character(),
##   ..   HeatingQC = col_character(),
##   ..   CentralAir = col_character(),
##   ..   Electrical = col_character(),
##   ..   `1stFlrSF` = col_double(),
##   ..   `2ndFlrSF` = col_double(),
##   ..   LowQualFinSF = col_double(),
##   ..   GrLivArea = col_double(),
##   ..   BsmtFullBath = col_double(),
##   ..   BsmtHalfBath = col_double(),
##   ..   FullBath = col_double(),
##   ..   HalfBath = col_double(),
##   ..   BedroomAbvGr = col_double(),
##   ..   KitchenAbvGr = col_double(),
##   ..   KitchenQual = col_character(),
##   ..   TotRmsAbvGrd = col_double(),
##   ..   Functional = col_character(),
##   ..   Fireplaces = col_double(),
##   ..   FireplaceQu = col_character(),
##   ..   GarageType = col_character(),
##   ..   GarageYrBlt = col_double(),
##   ..   GarageFinish = col_character(),
##   ..   GarageCars = col_double(),
##   ..   GarageArea = col_double(),
##   ..   GarageQual = col_character(),
##   ..   GarageCond = col_character(),
##   ..   PavedDrive = col_character(),
##   ..   WoodDeckSF = col_double(),
##   ..   OpenPorchSF = col_double(),
##   ..   EnclosedPorch = col_double(),
##   ..   `3SsnPorch` = col_double(),
##   ..   ScreenPorch = col_double(),
##   ..   PoolArea = col_double(),
##   ..   PoolQC = col_character(),
##   ..   Fence = col_character(),
##   ..   MiscFeature = col_character(),
##   ..   MiscVal = col_double(),
##   ..   MoSold = col_double(),
##   ..   YrSold = col_double(),
##   ..   SaleType = col_character(),
##   ..   SaleCondition = col_character(),
##   ..   SalePrice = col_double()
##   .. )
test_labels <- test$Id
test$Id <- NULL
train$Id <- NULL
test$SalePrice <- NA

##col name不同,無法rbind
which(names(test)!=names(train))
## [1] 43 44 69
colnames(test)[c(43,44,69)]<-colnames(train)[c(43,44,69)]
##?rbind= combine object by rows
all<- rbind(train, test)


dim(all)
## [1] 2919   80
attach(all)

check the dependent, sales price

ggplot(all, aes(SalePrice))+
  geom_histogram(fill='blue', binwidth = 10000)+
  scale_x_continuous(breaks= seq(0,800000,100000), labels = comma)

##found: right skewed, 大多數人都買便宜房子

summary(all$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   34900  129975  163000  180921  214000  755000    1459

check which numerical has the highest cor with dependent

#index vector of the numerical Vars
numericVar<-which(sapply(all,is.numeric))



# sapply(obj, func): 返回一個vector;
# 這個vector的長度=obj的長度,
# 然後sapply自己對obj中的每個小物件執行給定的func,
# func執行後,產生的每一個回應(新的小物件),
# 都會被丟回上面產生的vector的相應編號中
# eg. 這裏是對all裡面的每一個物件,先產生一個相應的數字(index, vector)(有name), 然後對每個all執行is.numeric,會返回一個布林;
# 把這個布林丟到which中,which只會選TRUE

class(numericVar)
## [1] "integer"
# 存著名字待會用
numericVarNames<- names(numericVar)
allNumVar<-all[,numericVar]


##!!!!! 畫相互cor矩陣
corNumVar<-cor(allNumVar, use="pairwise.complete.obs")

corSorted<- as.matrix(sort(corNumVar[,'SalePrice'],decreasing = TRUE))

# CorHigh<-names(which(corSorted>0.5))
CorHigh<-names(which(apply(corSorted,1,function(x) abs(x)>0.5)))
##?apply=
##?abs=算絕對值
corNumVarSorted<-corNumVar[CorHigh,CorHigh]

corrplot.mixed(corNumVarSorted,tl.col='black',tl.pos = 'lt')

explore the highest cor variable OverallQuality

# use box plot to check the subgroup relatio
ggplot(all[!is.na(SalePrice),], aes(factor(OverallQual),SalePrice))+
  geom_boxplot(col='blue')+labs(x='overall quality')+
  scale_y_continuous(breaks = seq(0,800000,100000),labels=comma)

##clear path of correlation bt saleprice and qverall quality, few outliers and no extreme values

explore the second highest cor variable, Above grade (ground) living area square feet

ggplot(all[!is.na(SalePrice),],aes(GrLivArea,SalePrice))+
  geom_point(col='blue',position = 'jitter', alpha=0.25)+
  scale_y_continuous(breaks=seq(0,800000,100000),labels=comma)+
  geom_smooth(method=lm, col='black',aes(group=1))+
  ##?group=
  geom_text_repel(aes(label=ifelse(GrLivArea[!is.na(SalePrice)]>4500,rownames(all),'')))

##?geom_text_repel: 直接在圖上加上點的數值
#ifelse(test, yes, no)= 給一個布林(test),如果是true,返回(yes),是false,返回(no)
##?rownames: 返回每個row的編號(PK)(其實電腦內建有一個編號,只是沒有顯示出來);For a data frame, value for rownames should be a character vector of non-duplicated and non-missing names (this is enforced), and for colnames a character vector of (preferably) unique syntactically-valid names.


###綜合這行的意思就是:把圖上每個點都寫上字,寫什麼?label,
###如果某row"SP不是na"且Gr>4500,返回yes,而yes的label是 那個row的編號
###反之,返回no,no的label是空字串



all[c(524,1299),c('SalePrice', 'GrLivArea','OverallQual')]
##發現有兩間房子很大但很便宜,可能是outliers,但先考慮是否是overal quality的問題,結果叫出來,發現quality 也很好,所以要把這兩筆放到outliers清單

##很強的線性關係,se也很小
check the missing values
##找有missing value 的col( index )
NAcol<-which(colSums(is.na(all))>0)
##which= obj中挑出true的
##此處which 底下的數字是index不是數量!!!!
##is.na驗證每一個value並返回布林,只要這個col有一個na,那計算colsum時就會+1
NaNum<-sort(colSums(sapply(all[NAcol],is.na)),decreasing = TRUE)
##寫成func:


NaNum<-function(df){
  sort(colSums(sapply(all[which(colSums(is.na(df))>0)],is.na)),decreasing = TRUE)
}

cat('There are', length(NAcol), 'columns with missing values')
## There are 35 columns with missing values
##fix NA in 34 predictors( on column is SalesPrice)

Imputing missing data:Pool

all$PoolQC[is.na(all$PoolQC)]<-'None'
table(all$PoolQC)
## 
##   Ex   Fa   Gd None 
##    4    2    4 2909
Qualities<- c('None'=0,'Po'=1,'Fa'=2,'TA'=3,'Gd'=4,'Ex'=5)
all$PoolQC<-as.integer(plyr::revalue(all$PoolQC,Qualities))
#?plyr::revalue(obj1,obj2), 把obj1的named level換成obj2
table(all$PoolQC)
## 
##    0    2    4    5 
## 2909    2    4    4
##下面這裡把all$PoolArea>0 &all$PoolQC == 0 的3rows 重新賦值
all[all$PoolArea>0 &all$PoolQC == 0, "PoolQC"]<-c(2,3,2)
table(all$PoolQC)
## 
##    0    2    3    4    5 
## 2906    4    1    4    4

Imputing missing data: Miscellaneous Feature

table(all$MiscFeature)
## 
## Gar2 Othr Shed TenC 
##    5    4   95    1
all$MiscFeature[is.na(all$MiscFeature)]='None'
all$MiscFeature<-as.factor(all$MiscFeature)

ggplot(all[!is.na(all$SalePrice),],aes(MiscFeature,SalePrice))+
  geom_bar(stat = 'summary', fun.y='median',fill='blue')+
##?stat=summary: x分類中的平均y值
  scale_y_continuous(breaks=seq(0,800000,50000),labels = comma)+
  #######下面這串看不懂,?geom_label:
  geom_label(stat='count',aes(label=..count..,y=..count..))

##seems like MiscFeatures are irrelevnt to saleprice, eventhough house with Tennis court is much more expensive, but there is only one house with TC; House with Shed are cheaper, but it probably because shed is alternative of garrage.

Imputing missing data: LotFrontage

##LotFrontage
which(is.na(all$LotFrontage))
##   [1]    8   13   15   17   25   32   43   44   51   65   67   77   85   96
##  [15]  101  105  112  114  117  121  127  132  134  137  148  150  153  154
##  [29]  161  167  170  171  178  181  187  192  204  208  209  215  219  222
##  [43]  235  238  245  250  270  288  289  294  308  309  311  320  329  331
##  [57]  336  343  347  348  352  357  361  362  365  367  370  371  376  385
##  [71]  393  394  405  406  413  422  427  448  453  458  459  460  466  471
##  [85]  485  491  497  517  519  530  538  539  540  542  546  560  561  565
##  [99]  570  581  594  611  612  613  617  624  627  642  646  661  667  669
## [113]  673  680  683  686  688  691  707  710  715  721  722  727  735  746
## [127]  747  752  758  771  784  786  790  792  795  812  817  818  823  829
## [141]  841  846  852  854  856  857  860  866  869  880  883  894  901  905
## [155]  909  912  918  926  928  929  930  940  942  945  954  962  968  976
## [169]  981  984  989  997  998 1004 1007 1018 1019 1025 1031 1033 1034 1036
## [183] 1038 1042 1046 1058 1060 1065 1078 1085 1087 1098 1109 1111 1117 1123
## [197] 1125 1139 1142 1144 1147 1149 1154 1155 1162 1165 1178 1181 1191 1194
## [211] 1207 1214 1231 1234 1245 1248 1252 1254 1261 1263 1269 1271 1272 1273
## [225] 1277 1278 1287 1288 1291 1301 1302 1310 1313 1319 1322 1343 1347 1349
## [239] 1355 1357 1358 1359 1363 1366 1369 1374 1382 1384 1397 1408 1418 1420
## [253] 1424 1425 1430 1432 1442 1444 1447 1467 1501 1502 1506 1508 1513 1520
## [267] 1536 1543 1559 1564 1566 1568 1574 1580 1585 1593 1607 1613 1628 1635
## [281] 1638 1640 1643 1644 1645 1648 1649 1660 1690 1691 1692 1696 1699 1701
## [295] 1729 1732 1733 1734 1735 1737 1738 1740 1741 1744 1747 1751 1755 1758
## [309] 1759 1762 1769 1820 1824 1834 1841 1844 1847 1848 1849 1862 1863 1864
## [323] 1873 1879 1882 1884 1886 1903 1911 1912 1923 1937 1942 1946 1948 1950
## [337] 1956 1958 1985 1986 1989 1990 1993 1997 2000 2024 2030 2031 2040 2042
## [351] 2043 2045 2050 2053 2065 2075 2111 2112 2123 2129 2132 2138 2141 2142
## [365] 2143 2147 2149 2156 2158 2164 2165 2167 2168 2171 2172 2174 2175 2176
## [379] 2179 2187 2203 2204 2205 2223 2224 2235 2242 2246 2251 2254 2255 2258
## [393] 2259 2269 2280 2301 2321 2326 2328 2358 2362 2373 2380 2388 2390 2396
## [407] 2397 2404 2422 2424 2432 2447 2460 2467 2481 2484 2485 2491 2492 2493
## [421] 2494 2499 2513 2515 2522 2531 2535 2536 2548 2568 2569 2571 2573 2584
## [435] 2587 2588 2595 2597 2598 2603 2607 2612 2615 2617 2618 2621 2626 2635
## [449] 2637 2663 2673 2674 2677 2678 2681 2684 2685 2701 2704 2705 2707 2708
## [463] 2709 2710 2715 2716 2725 2728 2738 2739 2742 2765 2808 2811 2812 2813
## [477] 2815 2816 2819 2840 2846 2848 2851 2901 2902 2909
ggplot(all[!is.na(all$LotFrontage),],aes(as.factor(Neighborhood),LotFrontage))+
  geom_bar(stat='summary',fun.y='median',fill='blue')+
  xlab('neighborhood')+
  theme(axis.text.x= element_text(angle = 45,hjust = 1))

##theme 調整底下label的顯示方式,讓它不會疊在一起,angle表示45度斜角顯示,hjust表示下移一單位


for (i in 1:nrow(all)){
  if(is.na(all$LotFrontage[i])){
    all$LotFrontage[i]<-
      as.integer(median(all$LotFrontage[all$Neighborhood==all$Neighborhood[i]],na.rm=TRUE))
  }
}

##把lotfrontage 裡面的missing value 換成中位數

for (i in 1:nrow(all)){
        if(is.na(all$LotFrontage[i])){
               all$LotFrontage[i] <- as.integer(median(all$LotFrontage[all$Neighborhood==all$Neighborhood[i]], na.rm=TRUE)) 
        }
}

sum(is.na(all$LotFrontage))
## [1] 0

Imputing missing data: LotShape

table(all$LotShape)
## 
##  IR1  IR2  IR3  Reg 
##  968   76   16 1859
sum(is.na(LotShape))
## [1] 0
all$LotShape<- as.integer(plyr::revalue(all$LotShape,c('IR3'=0, 'IR2'=1,'IR1'=2,'Reg'=3)))
table(all$LotShape)
## 
##    0    1    2    3 
##   16   76  968 1859
ggplot(all[!is.na(SalePrice),], aes(as.factor(LotShape),SalePrice))+
  geom_boxplot()+xlab('LotShape')+
  scale_y_continuous(labels = comma)

Imputing missing data: LotConfig

table(all$LotConfig)
## 
##  Corner CulDSac     FR2     FR3  Inside 
##     511     176      85      14    2133
ggplot(all[!is.na(all$SalePrice),], aes(as.factor(LotConfig),SalePrice))+
     geom_bar(stat='summary',fun.y='median',fill='blue')+
     scale_y_continuous(breaks=seq(0,800000,50000),labels = comma)+
     geom_label(stat='count',aes(label=..count..,y=..count..))

Imputing missing data: Garage

all$GarageYrBlt[is.na(all$GarageYrBlt)] <- all$YearBuilt[is.na(all$GarageYrBlt)]
# As NAs mean ‘No Garage’ for character variables, I now want to find out where the differences between the 157 NA GarageType and the other 3 character variables with 159 NAs come from.
#check if all 157 NAs are the same observations among the variables with 157/159 NAs
length(which(is.na(all$GarageType)&is.na(all$GarageFinish)&is.na(all$GarageCond)& is.na(all$GarageQual)))
## [1] 157
#Find the 2 additional NAs

##?kable
kable(all[!is.na(GarageType)&is.na(GarageFinish),c('GarageCars', 'GarageArea', 'GarageType', 'GarageCond', 'GarageQual', 'GarageFinish')])
GarageCars GarageArea GarageType GarageCond GarageQual GarageFinish
1 360 Detchd NA NA NA
NA NA Detchd NA NA NA
##check the row index
which(!is.na(GarageType)&is.na(GarageFinish))
## [1] 2127 2577
#The 157 NAs within GarageType all turn out to be NA in GarageCondition, GarageQuality, and GarageFinish as well. The differences are found in houses 2127 and 2577. As you can see, house 2127 actually does seem to have a Garage and house 2577 does not. Therefore, there should be 158 houses without a Garage. To fix house 2127, I will imputate the most common values (modes) for GarageCond, GarageQual, and GarageFinish.

# imputate 2127 the most common values (modes) for GarageCond, GarageQual, and GarageFinish.

#在table前加一個負號,這樣table本來是從少到多排序,就會被反過來,就能選第一個(是最多)
names(sort(-table(all$GarageCond)))[1]
## [1] "TA"
names(sort(-table(all$GarageQual)))[1]
## [1] "TA"
names(sort(-table(all$GarageFinish)))[1]
## [1] "Unf"
all[2127,c('GarageCond',"GarageQual","GarageFinish")]<-
  c(names(sort(-table(all$GarageCond)))[1],
    names(sort(-table(all$GarageQual)))[1],
    names(sort(-table(all$GarageFinish)))[1])

kable(all[2127,c('GarageYrBlt', 'GarageCars', 'GarageArea', 'GarageType', 'GarageCond', 'GarageQual', 'GarageFinish')])
GarageYrBlt GarageCars GarageArea GarageType GarageCond GarageQual GarageFinish
1910 1 360 Detchd TA TA Unf
#Both have 1 NA. As you can see above, it is house 2577 for both variables. The problem probably occured as the GarageType for this house is “detached”, while all other Garage-variables seem to indicate that this house has no Garage.
which(is.na(GarageCars))
## [1] 2577
which(is.na(GarageArea))
## [1] 2577
which(is.na(GarageType))
##   [1]   40   49   79   89   90  100  109  126  128  141  149  156  164  166
##  [15]  199  211  242  251  288  292  308  376  387  394  432  435  442  465
##  [29]  496  521  529  534  536  563  583  614  615  621  636  637  639  650
##  [43]  706  711  739  751  785  827  844  922  943  955  961  969  971  977
##  [57] 1010 1012 1031 1039 1097 1124 1132 1138 1144 1174 1180 1219 1220 1235
##  [71] 1258 1284 1324 1326 1327 1338 1350 1408 1450 1451 1454 1514 1532 1540
##  [85] 1553 1557 1559 1561 1591 1594 1595 1615 1616 1718 1722 1788 1809 1811
##  [99] 1812 1820 1823 1832 1835 1837 1840 1848 1894 2011 2082 2091 2094 2097
## [113] 2100 2105 2136 2152 2154 2190 2191 2192 2193 2194 2213 2239 2247 2354
## [127] 2355 2399 2400 2423 2427 2553 2554 2558 2576 2580 2604 2610 2692 2694
## [141] 2709 2768 2772 2790 2792 2800 2860 2863 2871 2889 2892 2893 2894 2910
## [155] 2914 2915 2918
which(is.na(all[,c('GarageCars','GarageArea')]))
## [1] 2577 5496
all[c(2577,5946),c('GarageCars','GarageArea',"GarageCond","GarageFinish","GarageYrBlt","GarageQual","GarageType")]
#fixing 3 values for house 2577
all[2577,c("GarageCars","GarageArea","GarageType")]<-c(0,0,NA)

#check if NAs of the character variables are now all 158
length(which(is.na(all$GarageType) & is.na(all$GarageFinish) & is.na(all$GarageCond) & is.na(all$GarageQual)))
## [1] 158
#Now, the 4 character variables related to garage all have the same set of 158 NAs, which correspond to ‘No Garage’. I will fix all of them in the remainder of this section
#The values do not seem ordinal, so I will convert into a factor.
all$GarageType[is.na(all$GarageType)]<-'No Garage'
all$GarageType<-as.factor(all$GarageType)
table(all$GarageType)
## 
##    2Types    Attchd   Basment   BuiltIn   CarPort    Detchd No Garage 
##        23      1723        36       186        15       778       158
all$GarageFinish[is.na(all$GarageFinish)]<-'None'
Finish<-c('None'=0,'Unf'=1,'RFn'=2,'Fin'=3)
all$GarageFinish<-as.integer(plyr::revalue(all$GarageFinish,Finish))
table(all$GarageFinish)
## 
##    0    1    2    3 
##  158 1231  811  719
all$GarageQual[is.na(all$GarageQual)]<-'None'
all$GarageQual<-as.integer(plyr::revalue(all$GarageQual,Qualities))
table(all$GarageQual)
## 
##    0    1    2    3    4    5 
##  158    5  124 2605   24    3
all$GarageCond[is.na(all$GarageCond)]<-'None'
all$GarageCond<-as.integer((plyr::revalue(all$GarageCond,Qualities)))
table(all$GarageCond)
## 
##    0    1    2    3    4    5 
##  158   14   74 2655   15    3

Imputing missing data: Alley

all$Alley[is.na(Alley)]<-'None'
all$Alley<-as.factor(all$Alley)
table(all$Alley)
## 
## Grvl None Pave 
##  120 2721   78
ggplot(all[!is.na(SalePrice),],aes(Alley,SalePrice))+
  geom_bar(stat='summary',fun.y='median',fill='blue')+
  scale_y_continuous(breaks=seq(0,200000,50000),labels=comma)

##pave對價格影響不顯著, grvl 價格較低(是否有其他特徵影響??)

Imputing missing data:Fence

factor(all$Fence)
##    [1] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA> 
##   [12] <NA>  <NA>  <NA>  GdWo  GdPrv <NA>  <NA>  <NA>  MnPrv <NA>  GdPrv
##   [23] <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv <NA> 
##   [34] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  MnPrv MnPrv
##   [45] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  MnPrv <NA>  <NA>  MnPrv
##   [56] <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  GdPrv GdPrv <NA> 
##   [67] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  GdWo  <NA> 
##   [78] MnPrv <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##   [89] MnPrv <NA>  <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [100] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv
##  [111] <NA>  <NA>  <NA>  MnPrv MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [122] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnWw  GdPrv <NA> 
##  [133] <NA>  <NA>  MnPrv MnPrv <NA>  <NA>  GdPrv MnPrv <NA>  <NA>  MnPrv
##  [144] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  GdWo  <NA> 
##  [155] <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  MnPrv <NA> 
##  [166] <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv
##  [177] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  MnPrv GdPrv GdPrv
##  [188] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv
##  [199] MnPrv <NA>  <NA>  MnPrv <NA>  <NA>  MnWw  <NA>  <NA>  GdWo  <NA> 
##  [210] MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv <NA>  <NA>  <NA>  <NA> 
##  [221] <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA> 
##  [232] <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA> 
##  [243] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  <NA> 
##  [254] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  GdWo  <NA> 
##  [265] <NA>  GdPrv <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  MnPrv <NA> 
##  [276] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [287] GdPrv <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv MnPrv
##  [298] <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  GdWo  <NA>  <NA>  <NA>  MnPrv
##  [309] <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  GdPrv <NA>  <NA> 
##  [320] <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo 
##  [331] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [342] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [353] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  MnPrv <NA>  <NA> 
##  [364] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv GdWo 
##  [375] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [386] <NA>  <NA>  GdWo  <NA>  <NA>  MnPrv <NA>  MnPrv GdWo  <NA>  <NA> 
##  [397] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  GdWo  <NA> 
##  [408] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [419] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [430] <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv GdPrv
##  [441] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA> 
##  [452] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  MnPrv
##  [463] GdWo  <NA>  <NA>  <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [474] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA> 
##  [485] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA> 
##  [496] GdWo  <NA>  GdPrv MnPrv MnWw  <NA>  <NA>  GdPrv GdWo  <NA>  <NA> 
##  [507] <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  GdPrv
##  [518] <NA>  MnPrv MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [529] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnWw  <NA> 
##  [540] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA> 
##  [551] <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  MnWw  <NA>  <NA>  <NA>  <NA> 
##  [562] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [573] <NA>  <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [584] <NA>  <NA>  <NA>  MnWw  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [595] MnPrv <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  <NA> 
##  [606] <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  MnPrv
##  [617] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo 
##  [628] <NA>  <NA>  GdPrv MnPrv <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA> 
##  [639] MnPrv <NA>  <NA>  <NA>  MnPrv GdWo  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [650] MnPrv <NA>  MnPrv <NA>  MnPrv <NA>  <NA>  MnPrv MnPrv <NA>  MnPrv
##  [661] <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [672] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [683] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [694] <NA>  MnPrv <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv
##  [705] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA> 
##  [716] MnPrv <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [727] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  <NA>  <NA> 
##  [738] <NA>  <NA>  <NA>  GdPrv GdPrv GdPrv <NA>  <NA>  <NA>  <NA>  <NA> 
##  [749] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA> 
##  [760] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA> 
##  [771] <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA> 
##  [782] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [793] <NA>  <NA>  <NA>  MnPrv MnPrv <NA>  <NA>  MnPrv <NA>  <NA>  <NA> 
##  [804] <NA>  GdWo  <NA>  <NA>  <NA>  GdWo  MnPrv GdPrv <NA>  <NA>  <NA> 
##  [815] <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  GdWo  <NA> 
##  [826] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA> 
##  [837] GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  <NA> 
##  [848] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv <NA>  <NA>  MnPrv <NA> 
##  [859] MnPrv <NA>  GdPrv <NA>  GdPrv GdPrv <NA>  MnPrv <NA>  <NA>  <NA> 
##  [870] GdPrv <NA>  <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv GdWo 
##  [881] <NA>  <NA>  MnPrv <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv
##  [892] <NA>  MnPrv GdPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [903] <NA>  <NA>  MnPrv MnPrv <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  MnPrv
##  [914] <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  GdPrv <NA>  <NA> 
##  [925] <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  GdWo  <NA>  <NA>  <NA> 
##  [936] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  MnWw 
##  [947] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv MnPrv <NA>  <NA>  GdPrv
##  [958] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [969] GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  <NA> 
##  [980] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  [991] <NA>  <NA>  MnWw  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA> 
## [1002] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1013] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1024] <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1035] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  MnPrv
## [1046] <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  MnPrv
## [1057] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv <NA>  <NA> 
## [1068] <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA> 
## [1079] <NA>  <NA>  <NA>  MnPrv <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA> 
## [1090] <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1101] <NA>  <NA>  MnWw  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1112] <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  GdWo  MnPrv <NA> 
## [1123] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv <NA> 
## [1134] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1145] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  GdPrv
## [1156] <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1167] <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  MnPrv <NA>  <NA>  MnPrv
## [1178] <NA>  <NA>  <NA>  GdPrv <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  GdPrv
## [1189] <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  MnPrv <NA>  <NA>  <NA>  <NA> 
## [1200] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  MnPrv <NA>  <NA>  <NA> 
## [1211] GdPrv <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA> 
## [1222] <NA>  GdWo  MnPrv <NA>  MnWw  <NA>  <NA>  <NA>  GdWo  <NA>  GdWo 
## [1233] <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv
## [1244] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  MnPrv
## [1255] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA> 
## [1266] <NA>  MnPrv <NA>  GdWo  GdWo  <NA>  <NA>  MnPrv GdPrv <NA>  <NA> 
## [1277] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1288] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  GdWo  MnPrv <NA> 
## [1299] <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo 
## [1310] GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1321] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  MnPrv MnPrv <NA>  <NA> 
## [1332] <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA> 
## [1343] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1354] <NA>  <NA>  GdPrv GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1365] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA> 
## [1376] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv
## [1387] MnPrv GdWo  <NA>  <NA>  <NA>  <NA>  MnWw  <NA>  <NA>  <NA>  <NA> 
## [1398] <NA>  GdPrv MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  MnPrv MnPrv
## [1409] <NA>  MnPrv <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1420] <NA>  <NA>  <NA>  <NA>  GdPrv GdWo  <NA>  <NA>  <NA>  MnPrv <NA> 
## [1431] <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv GdWo  <NA>  MnPrv <NA>  <NA> 
## [1442] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  <NA>  <NA> 
## [1453] <NA>  <NA>  <NA>  <NA>  MnPrv GdPrv <NA>  <NA>  MnPrv <NA>  MnPrv
## [1464] <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA> 
## [1475] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1486] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA> 
## [1497] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1508] <NA>  GdWo  <NA>  <NA>  GdPrv <NA>  <NA>  GdWo  <NA>  <NA>  <NA> 
## [1519] MnPrv GdPrv <NA>  GdWo  <NA>  <NA>  GdWo  <NA>  <NA>  GdWo  <NA> 
## [1530] <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1541] <NA>  MnPrv <NA>  MnPrv MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv <NA> 
## [1552] GdPrv <NA>  GdPrv <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA> 
## [1563] <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv <NA>  <NA> 
## [1574] <NA>  MnPrv <NA>  <NA>  <NA>  GdPrv <NA>  MnPrv MnPrv <NA>  <NA> 
## [1585] <NA>  <NA>  MnPrv MnPrv <NA>  <NA>  MnPrv MnPrv <NA>  <NA>  <NA> 
## [1596] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  GdPrv
## [1607] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1618] <NA>  MnPrv <NA>  GdWo  <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1629] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1640] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  <NA>  <NA>  GdPrv
## [1651] GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1662] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv
## [1673] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1684] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA> 
## [1695] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1706] GdPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1717] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1728] <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  MnPrv GdWo  <NA>  MnPrv GdPrv
## [1739] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1750] MnPrv <NA>  GdPrv <NA>  <NA>  <NA>  GdWo  <NA>  MnPrv <NA>  MnPrv
## [1761] <NA>  MnPrv <NA>  MnPrv MnPrv GdPrv GdWo  <NA>  <NA>  <NA>  GdWo 
## [1772] GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv GdWo  GdWo  <NA> 
## [1783] <NA>  <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  MnPrv
## [1794] <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  GdWo  <NA>  <NA>  <NA>  <NA> 
## [1805] <NA>  <NA>  <NA>  MnPrv <NA>  MnPrv <NA>  <NA>  MnPrv <NA>  <NA> 
## [1816] GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  GdWo  <NA>  <NA> 
## [1827] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA> 
## [1838] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA> 
## [1849] <NA>  GdPrv <NA>  MnPrv <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  GdPrv
## [1860] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1871] <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  MnPrv <NA>  <NA> 
## [1882] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  MnPrv
## [1893] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv GdPrv MnPrv <NA> 
## [1904] MnPrv <NA>  <NA>  <NA>  GdPrv GdPrv MnPrv <NA>  GdPrv <NA>  <NA> 
## [1915] <NA>  <NA>  <NA>  MnPrv GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1926] <NA>  MnPrv <NA>  MnPrv <NA>  GdPrv <NA>  MnPrv <NA>  <NA>  <NA> 
## [1937] <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  <NA> 
## [1948] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  MnPrv GdWo  GdPrv
## [1959] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1970] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1981] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [1992] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2003] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2014] <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  GdPrv <NA> 
## [2025] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2036] <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  <NA>  GdWo  <NA>  <NA> 
## [2047] <NA>  <NA>  <NA>  <NA>  GdWo  <NA>  MnPrv <NA>  MnPrv <NA>  MnPrv
## [2058] MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  GdWo  <NA>  <NA>  <NA> 
## [2069] GdPrv <NA>  <NA>  GdWo  GdWo  MnPrv <NA>  <NA>  <NA>  <NA>  GdWo 
## [2080] GdWo  MnWw  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA> 
## [2091] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA> 
## [2102] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA> 
## [2113] <NA>  MnPrv GdPrv <NA>  GdPrv <NA>  <NA>  MnPrv MnPrv <NA>  <NA> 
## [2124] MnPrv <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv
## [2135] <NA>  <NA>  <NA>  MnPrv <NA>  MnPrv GdWo  MnPrv <NA>  <NA>  <NA> 
## [2146] <NA>  <NA>  MnPrv MnPrv <NA>  <NA>  <NA>  MnPrv <NA>  GdWo  GdPrv
## [2157] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2168] <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2179] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  MnPrv <NA> 
## [2190] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv
## [2201] <NA>  GdPrv <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  MnPrv GdPrv MnPrv
## [2212] <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2223] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2234] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2245] GdWo  GdWo  <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA> 
## [2256] <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2267] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA> 
## [2278] GdWo  <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2289] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2300] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2311] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2322] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2333] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2344] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2355] <NA>  <NA>  <NA>  GdPrv MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2366] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2377] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA> 
## [2388] <NA>  GdWo  <NA>  GdPrv <NA>  MnPrv GdPrv MnPrv MnPrv <NA>  MnPrv
## [2399] MnPrv MnPrv <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2410] <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  GdWo 
## [2421] GdPrv MnPrv <NA>  GdWo  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA> 
## [2432] <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA> 
## [2443] <NA>  MnPrv <NA>  MnPrv MnPrv <NA>  <NA>  <NA>  MnPrv MnPrv <NA> 
## [2454] <NA>  <NA>  <NA>  <NA>  GdWo  MnPrv <NA>  MnPrv GdPrv <NA>  MnPrv
## [2465] <NA>  <NA>  GdPrv MnPrv MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2476] MnPrv MnPrv <NA>  MnPrv <NA>  MnPrv GdPrv <NA>  <NA>  MnPrv <NA> 
## [2487] <NA>  MnPrv <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  <NA> 
## [2498] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA> 
## [2509] <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  GdPrv <NA> 
## [2520] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv MnPrv <NA>  <NA>  <NA> 
## [2531] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2542] <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2553] <NA>  GdWo  <NA>  MnPrv <NA>  <NA>  MnPrv <NA>  MnPrv <NA>  MnPrv
## [2564] GdPrv <NA>  <NA>  MnPrv <NA>  <NA>  MnPrv <NA>  <NA>  GdWo  <NA> 
## [2575] MnPrv <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA> 
## [2586] <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  <NA> 
## [2597] <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  GdWo  MnPrv MnPrv
## [2608] GdPrv GdPrv GdWo  <NA>  MnPrv MnPrv <NA>  <NA>  <NA>  <NA>  <NA> 
## [2619] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA> 
## [2630] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  GdWo 
## [2641] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  <NA>  <NA>  <NA> 
## [2652] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2663] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2674] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv
## [2685] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2696] <NA>  GdPrv <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv MnPrv MnPrv
## [2707] <NA>  MnPrv GdWo  <NA>  GdPrv <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv
## [2718] <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA> 
## [2729] <NA>  MnPrv MnPrv GdWo  <NA>  MnPrv <NA>  MnPrv MnPrv <NA>  <NA> 
## [2740] <NA>  GdPrv <NA>  MnPrv <NA>  <NA>  MnPrv <NA>  GdWo  <NA>  MnPrv
## [2751] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2762] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdWo  <NA> 
## [2773] <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  MnPrv <NA>  <NA>  <NA> 
## [2784] <NA>  MnPrv <NA>  GdWo  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA> 
## [2795] <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  GdWo  <NA>  <NA>  <NA> 
## [2806] <NA>  <NA>  <NA>  GdPrv GdWo  MnPrv GdPrv <NA>  <NA>  <NA>  <NA> 
## [2817] <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA> 
## [2828] MnPrv <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2839] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  MnPrv <NA>  <NA>  <NA>  <NA> 
## [2850] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2861] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA>  MnPrv <NA> 
## [2872] <NA>  <NA>  MnPrv <NA>  MnPrv <NA>  <NA>  <NA>  <NA>  MnPrv <NA> 
## [2883] <NA>  GdWo  MnPrv <NA>  <NA>  MnPrv GdWo  MnPrv <NA>  <NA>  <NA> 
## [2894] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## [2905] <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  GdPrv <NA> 
## [2916] <NA>  <NA>  MnPrv <NA> 
## Levels: GdPrv GdWo MnPrv MnWw
all$Fence[is.na(all$Fence)]<-'None'
table(all$Fence)
## 
## GdPrv  GdWo MnPrv  MnWw  None 
##   118   112   329    12  2348
ggplot(all[!is.na(SalePrice),],aes(Fence,SalePrice))+
  geom_bar(stat='summary',fun.y='median', fill='blue')+
  scale_y_continuous(breaks=seq(0,200000,50000),labels=comma)

detach("package:Rmisc", unload=TRUE)
detach("package:plyr", unload=TRUE)
all[!is.na(SalePrice),]%>% group_by(Fence)%>%summarise(median=median(SalePrice),counts=n())
all$Fence<-factor(all$Fence)


#Fence有無無顯著影響

Imputing missing data:Fire place

library(plyr)
table(all$FireplaceQu)
## 
##  Ex  Fa  Gd  Po  TA 
##  43  74 744  46 592
all$FireplaceQu[is.na(all$FireplaceQu)]<-"None"
table(all$FireplaceQu)
## 
##   Ex   Fa   Gd None   Po   TA 
##   43   74  744 1420   46  592
all$FireplaceQu<-as.integer(plyr::revalue(all$FireplaceQu,Qualities))

##########

table(all$Fireplaces)
## 
##    0    1    2    3    4 
## 1420 1268  219   11    1
sum(is.na(all$Fireplaces))
## [1] 0
class(all$Fireplaces)
## [1] "numeric"

Imputing missing data: Basement

NaNum
## function(df){
##   sort(colSums(sapply(all[which(colSums(is.na(df))>0)],is.na)),decreasing = TRUE)
## }
length(which(is.na(all$BsmtQual)&is.na(all$BsmtCond)&is.na(all$BsmtExposure)&is.na(all$BsmtFinType1) &is.na(all$BsmtFinType2)))
## [1] 79
all[!is.na(all$BsmtFinType1) & (is.na(all$BsmtQual)|is.na(all$BsmtExposure)|is.na(all$BsmtFinType2)|is.na(all$BsmtCond)), c("BsmtQual","BsmtExposure","BsmtFinType2","BsmtCond")]
which(!is.na(all$BsmtFinType1) &(is.na(all$BsmtQual)|is.na(all$BsmtExposure)|is.na(all$BsmtFinType2)|is.na(all$BsmtCond)))
## [1]  333  949 1488 2041 2186 2218 2219 2349 2525
##So altogether, it seems as if there are 79 houses without a basement, because the basement variables of the other houses with missing values are all 80% complete (missing 1 out of 5 values). I am going to impute the modes to fix those 9 houses.

#Imputing modes.
all$BsmtFinType2[333]<-names(sort(-table(all$BsmtFinType2)))[1]
all$BsmtCond[c(2041,2186,2525)]<-names(sort(-table(all$BsmtCond)))[1]
all$BsmtExposure[c(949,1488,2349)]<-names(sort(-table(all$BsmtExposure)))[1]
all$BsmtQual[c(2218,2219)]<-names(sort(-table(all$BsmtQual)))[1]

######

all$BsmtQual[is.na(all$BsmtQual)]<-"None"
all$BsmtQual<-as.integer(plyr::revalue(all$BsmtQual,Qualities))
table(all$BsmtQual)
## 
##    0    2    3    4    5 
##   79   88 1285 1209  258
###
all$BsmtCond[is.na(all$BsmtCond)]<-'None'
all$BsmtCond<-as.integer(plyr::revalue(all$BsmtCond,Qualities))
table(all$BsmtCond)
## 
##    0    1    2    3    4 
##   79    5  104 2609  122
###
all$BsmtExposure[is.na(all$BsmtExposure)]<-'None'
table(all$BsmtExposure)
## 
##   Av   Gd   Mn   No None 
##  418  276  239 1907   79
Exposure<-c('None'=0,'No'=1, 'Mn'=2,'Gd'=3,'Av'=4)
all$BsmtExposure<-as.integer(plyr::revalue(all$BsmtExposure,Exposure))
###
all$BsmtFinType1[is.na(all$BsmtFinType1)]<-'None'
FinType<-c('None'=0,'Unf'=1,'LwQ'=2,'Rec'=3,'BLQ'=4,'ALQ'=5,'GLQ'=6)
all$BsmtFinType1<-as.integer(plyr::revalue(all$BsmtFinType1,FinType))
table(all$BsmtFinType1)
## 
##   0   1   2   3   4   5   6 
##  79 851 154 288 269 429 849
###
all$BsmtFinType2[is.na(all$BsmtFinType2)]<-'None'
all$BsmtFinType2<-as.integer(plyr::revalue(all$BsmtFinType2,FinType))
table(all$BsmtFinType2)
## 
##    0    1    2    3    4    5    6 
##   79 2494   87  105   68   52   34
all[(is.na(all$BsmtUnfSF)|is.na(all$BsmtFinSF2)|is.na(all$BsmtFinSF1)|is.na(all$BsmtHalfBath)|is.na(all$BsmtFullBath)|is.na(all$TotalBsmtSF)),c('BsmtUnfSF','BsmtFinSF2','BsmtFinSF1','BsmtHalfBath','BsmtFullBath','TotalBsmtSF')]
which(is.na(all$BsmtUnfSF)|is.na(all$BsmtFinSF2)|is.na(all$BsmtFinSF1)|is.na(all$BsmtHalfBath)|is.na(all$BsmtFullBath)|is.na(all$TotalBsmtSF))
## [1] 2121 2189
all[c(2121,2189),c('BsmtUnfSF','BsmtFinSF2','BsmtFinSF1','BsmtHalfBath','BsmtFullBath','TotalBsmtSF')]
table(all$TotalBsmtSF)
## 
##    0  105  160  173  190  192  216  240  245  264  270  279  290  297  301 
##   78    1    1    1    1    1    2    1    1    4    1    1    1    1    1 
##  319  346  348  352  356  360  370  372  381  384  385  392  396  405  407 
##    1    2    1    1    1    1    1    1    1   19    1    2    1    1    1 
##  408  409  416  420  423  430  432  440  448  450  451  456  458  462  468 
##    1    1    2    1    1    1    1    3    2    1    1    2    1    2    1 
##  480  481  482  483  484  485  492  494  498  502  504  506  516  520  525 
##    2    1    1   14    1    1    1    1    1    1    2    1    3    5    5 
##  526  528  530  531  533  536  539  540  544  546  547  550  551  552  554 
##    1    4    4    1    2    2    1    2    1   16    6    1    1    1    1 
##  559  560  561  565  569  570  572  576  583  585  588  592  596  600  602 
##    1    5    3    1    1    3    7    5    1    6    4    2    3   16    1 
##  608  610  611  612  616  617  621  622  624  625  626  628  629  630  631 
##    5    1    1    3    6    1    1    1   12    2    1    1    1   13    1 
##  635  636  637  641  644  648  649  650  651  654  655  656  658  660  661 
##    1    2    3    2    2    2    2    3    2    1    1    1    1    7    1 
##  662  663  666  671  672  673  675  676  677  678  680  683  684  686  687 
##    2    3    2    1   29    1    2    6    1    1    6    1    7    7    1 
##  689  690  691  693  696  697  698  699  700  702  704  707  708  709  710 
##    6    2    5    1    1    3    7    1    2    3    4    4    1    2    1 
##  712  713  714  715  716  718  720  721  723  725  727  728  729  731  732 
##    3    2    1    5    4    1   16    1    3    6    1   20    3    2    5 
##  734  735  736  738  739  740  741  742  744  745  746  747  750  752  754 
##    1    4    2    9    1    2    4    3    6    1    1    6    3    2    2 
##  755  756  757  760  761  763  764  765  768  770  771  773  774  776  777 
##    3   17    1    1    1    1    5    4   24    3    2    2    1    2    2 
##  778  779  780  781  782  783  784  788  789  791  792  793  794  795  796 
##    1    1   19    2    4    3   11    3    3    1    2    7    3    3   11 
##  797  798  799  800  801  802  803  804  806  807  808  810  811  812  813 
##    1    5    3    2    2    1    2    6    4    1    5    3    1    2    3 
##  814  815  816  817  818  819  821  822  824  825  826  827  828  830  831 
##    2    1   23    2    2    2    1    2    3    6    1    2    3    1    3 
##  832  833  835  836  837  839  840  841  842  844  845  846  847  848  849 
##   17    5    5    6    1    1   13    1    1    2    7    2    7   16    1 
##  850  851  852  853  854  855  856  858  859  860  861  862  864  866  868 
##    2    2    1    2    3    9    5    9    2    4    6    1   74    3    4 
##  869  870  872  873  874  876  878  879  880  882  883  884  886  888  890 
##    1    3    3    1    2   11    1    2    5    7    1   10    1    2    1 
##  891  892  893  894  895  896  900  901  902  903  904  907  908  910  911 
##    1    2    1   17    1    8    4    6    1    1    1    4    2    7    2 
##  912  913  914  915  916  917  918  920  922  923  924  925  926  927  928 
##   26    2    4    3    4    1    4    6    1    4    5    8    7    1    9 
##  929  930  931  932  935  936  938  939  940  941  942  943  944  945  946 
##    2    5    2    3    2   16    5    2    3    5    2    4    3    5    1 
##  948  949  950  951  952  953  954  955  956  957  958  959  960  961  963 
##    5    3    2    4   13    2    2    3    1    2    2    1   18    1    1 
##  964  965  966  967  968  969  970  971  972  973  974  975  976  978  979 
##    2    1    2    5    1    3    8    1    7    3    1    5    6    3    2 
##  980  981  982  983  984  985  988  989  990  991  992  993  994  995  996 
##    6    3    5    1    6    2   12    1    8    2    8    1    4    2    3 
##  998  999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1010 1012 1013 1014 
##    2    1    2    3    1    1    4    4    1    1   19    3    2    2    1 
## 1015 1017 1019 1020 1021 1022 1024 1025 1026 1027 1028 1029 1030 1031 1032 
##    2    2    1    3    1    5    5    1    5    2    2    5    1    1    5 
## 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 
##    1    1    1    2    1    1   25    3    2    2    2    2    1    1    3 
## 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 
##    2    5    4    5    4    7    4   10    6    4    2    2    1    2    3 
## 1064 1065 1066 1067 1068 1069 1070 1072 1073 1074 1075 1076 1077 1078 1079 
##    6    4    1    2    2    3    2    1    8    2    2    1    2    6    2 
## 1080 1081 1082 1083 1084 1085 1086 1088 1089 1090 1091 1092 1093 1094 1095 
##    7    1    3    1    2    1    3    4    1    5    1    7    2    5    3 
## 1096 1097 1098 1099 1100 1103 1104 1105 1106 1107 1108 1109 1112 1113 1114 
##    2    3    1    2    7    2    6    5    1    4    3    1    2    3    6 
## 1116 1117 1118 1120 1121 1122 1124 1125 1126 1127 1128 1129 1130 1131 1132 
##    2    5    1    3    3    4    3    1    2    4    4    1    1    1    1 
## 1134 1135 1136 1138 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 
##    2    1    2    3    4    5    4    5    6    5    1    1    3    2    4 
## 1151 1152 1153 1154 1156 1157 1158 1160 1161 1162 1163 1164 1165 1166 1168 
##    3    6    3    1    2    1    6    4    1    2    1    3    3    1    7 
## 1169 1170 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1184 1187 
##    2    1    1    2    1    3    6    1    1    2    1    3    1    2    2 
## 1188 1189 1190 1191 1192 1193 1194 1195 1196 1198 1199 1200 1202 1203 1204 
##    5    1    3    2    1    1    1    2    4    1    2    7    2    1    4 
## 1205 1206 1208 1209 1210 1211 1212 1214 1215 1216 1217 1218 1219 1220 1221 
##    1    1    7    2    1    1    4    4    1    7    1    5    1    5    1 
## 1222 1223 1224 1225 1226 1228 1230 1231 1232 1234 1235 1236 1237 1240 1241 
##    1    2    3    1    2    4    1    1    4    2    2    1    3    4    1 
## 1242 1243 1244 1246 1247 1248 1249 1250 1251 1252 1253 1254 1256 1257 1258 
##    4    1    3    3    1    6    2    1    2    2    1    1    4    2    3 
## 1259 1260 1261 1262 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 
##    1    2    1    2    2    1    2    2    3    1    1    1    2    2    1 
## 1276 1277 1278 1280 1281 1282 1284 1286 1288 1290 1291 1292 1293 1295 1296 
##    2    2    2    3    1    1    3    2    4    2    1    1    1    1    3 
## 1297 1298 1299 1300 1302 1304 1306 1308 1309 1310 1311 1312 1313 1314 1316 
##    1    2    1    3    6    2    3    1    1    3    1    2    4    4    2 
## 1317 1318 1319 1324 1325 1326 1328 1329 1330 1331 1332 1334 1335 1336 1337 
##    1    1    2    4    1    2    1    2    2    1    2    2    1    2    4 
## 1338 1339 1340 1341 1342 1344 1346 1347 1348 1349 1350 1351 1352 1356 1357 
##    3    1    1    2    4    4    2    2    2    1    4    1    2    1    1 
## 1358 1360 1361 1362 1363 1364 1365 1367 1368 1369 1370 1372 1373 1374 1375 
##    3    2    1    7    1    2    2    2    3    1    3    4    2    1    1 
## 1376 1377 1378 1379 1380 1381 1382 1383 1385 1386 1388 1389 1390 1391 1392 
##    1    1    1    1    1    1    1    2    1    2    1    2    4    2    6 
## 1393 1394 1395 1396 1398 1400 1401 1402 1405 1406 1408 1409 1410 1413 1414 
##    2    1    2    2    3    1    1    1    6    2    2    1    1    1    2 
## 1415 1416 1417 1418 1419 1420 1422 1424 1425 1426 1427 1428 1430 1431 1432 
##    1    1    2    2    2    4    4    1    2    1    2    1    3    2    1 
## 1433 1434 1436 1437 1438 1440 1441 1442 1444 1445 1449 1450 1451 1452 1453 
##    1    2    3    2    1    5    1    2    3    1    1    1    1    1    4 
## 1454 1455 1459 1460 1461 1462 1463 1466 1468 1470 1473 1475 1476 1477 1478 
##    2    1    1    1    3    3    2    3    4    3    1    1    1    1    3 
## 1479 1480 1482 1484 1485 1486 1487 1488 1489 1490 1491 1492 1494 1495 1496 
##    2    1    3    2    1    1    1    4    3    1    2    2    6    1    2 
## 1498 1499 1500 1501 1502 1504 1505 1508 1509 1510 1511 1512 1516 1517 1518 
##    3    1    1    3    2    3    1    2    1    3    1    1    1    2    3 
## 1519 1520 1522 1524 1525 1528 1529 1530 1531 1533 1536 1538 1540 1541 1542 
##    1    1    1    3    1    3    1    3    1    1    1    1    1    2    2 
## 1544 1546 1550 1552 1553 1554 1555 1556 1557 1559 1560 1561 1562 1563 1564 
##    1    2    1    1    1    2    1    2    1    1    5    2    1    1    1 
## 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1577 1578 1580 1581 
##    1    2    1    4    1    1    1    2    5    2    1    1    2    4    1 
## 1582 1584 1587 1588 1590 1592 1593 1594 1595 1596 1598 1600 1602 1603 1604 
##    4    3    2    2    1    1    1    5    3    2    2    1    3    1    3 
## 1606 1610 1612 1614 1615 1616 1617 1618 1620 1621 1622 1623 1624 1625 1626 
##    2    1    1    5    1    2    2    1    3    1    1    1    2    3    3 
## 1629 1630 1632 1638 1641 1642 1643 1645 1649 1650 1652 1654 1656 1657 1660 
##    2    1    3    2    1    1    3    1    1    2    3    1    2    2    2 
## 1664 1666 1670 1671 1673 1675 1678 1679 1680 1682 1683 1684 1685 1686 1689 
##    2    2    2    1    2    1    1    1    5    2    1    1    1    4    2 
## 1694 1696 1698 1700 1702 1704 1705 1706 1709 1710 1712 1713 1714 1719 1720 
##    4    2    3    2    2    2    2    1    1    3    1    1    1    1    2 
## 1721 1722 1726 1728 1732 1733 1734 1736 1738 1739 1740 1742 1746 1748 1751 
##    2    1    2    8    1    1    3    1    1    1    2    2    1    1    1 
## 1752 1753 1760 1763 1765 1766 1768 1774 1776 1777 1778 1779 1780 1782 1790 
##    1    2    1    1    1    1    1    1    2    1    1    1    1    2    1 
## 1792 1795 1800 1802 1803 1809 1814 1822 1824 1829 1832 1833 1836 1838 1840 
##    2    1    5    1    1    1    1    3    2    1    1    1    4    4    1 
## 1842 1844 1845 1848 1850 1851 1856 1858 1860 1865 1866 1868 1869 1884 1896 
##    1    3    1    2    2    1    1    1    1    1    1    2    1    3    1 
## 1898 1902 1905 1907 1910 1921 1922 1926 1930 1932 1934 1935 1947 1949 1950 
##    1    1    1    1    1    1    2    1    2    1    1    1    1    1    1 
## 1952 1958 1964 1966 1967 1976 1978 1980 1982 1986 1992 1994 2000 2002 2006 
##    1    1    1    1    1    1    2    1    1    1    1    1    1    2    1 
## 2014 2020 2024 2033 2035 2036 2042 2046 2048 2062 2076 2077 2078 2108 2109 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 2110 2121 2136 2140 2153 2158 2171 2190 2200 2208 2216 2217 2220 2223 2271 
##    1    1    2    1    1    1    1    1    1    1    1    1    1    1    1 
## 2320 2330 2392 2396 2418 2444 2452 2458 2461 2492 2524 2535 2552 2630 2633 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 2660 2846 3094 3138 3200 3206 5095 6110 
##    1    1    1    1    1    1    1    1
all$TotalBsmtSF[c(2121,2189)]<-0

table(all$BsmtFullBath)
## 
##    0    1    2    3 
## 1705 1172   38    2
all$BsmtFullBath[c(2121,2189)]<-0

table(all$BsmtHalfBath)
## 
##    0    1    2 
## 2742  171    4
all$BsmtHalfBath[c(2121,2189)]<-0

table(all$BsmtFinSF1)
## 
##    0    2   16   20   24   25   27   28   32   33   35   36   40   41   42 
##  929    1   14    8   27    1    1    5    1    1    1    4    3    1    1 
##   48   49   50   51   52   53   54   55   56   57   60   63   64   65   68 
##    4    1    2    1    2    1    2    1    4    2    4    1    1    1    3 
##   70   72   73   75   76   77   78   80   81   85   88   94   96  100  104 
##    2    1    1    1    1    1    1    4    2    1    1    1    1    1    5 
##  108  110  111  113  114  116  119  120  121  122  125  126  128  129  130 
##    1    3    2    1    3    3    3    6    2    1    1    1    1    1    3 
##  131  132  133  134  138  140  141  143  144  148  149  150  152  154  155 
##    1    1    2    1    3    1    1    1    5    1    1    3    2    1    1 
##  156  162  165  167  168  169  170  172  173  175  176  179  180  181  182 
##    3    2    1    1    5    1    1    1    2    1    3    1    3    1    3 
##  185  186  187  188  189  190  191  192  193  194  196  197  198  200  201 
##    1    1    2    2    3    2    1    4    1    2    5    1    2    1    1 
##  203  204  205  206  207  208  209  210  212  213  215  216  218  219  220 
##    3    1    1    1    1    1    2    4    1    1    1    3    1    1    3 
##  221  222  223  224  225  226  228  230  231  234  236  238  239  240  241 
##    1    1    1    3    1    2    2    1    2    3    2    1    1    3    2 
##  242  244  246  247  248  249  250  251  252  254  256  257  258  259  260 
##    1    1    2    4    2    2    5    1    5    1    3    3    1    2    2 
##  261  262  263  264  266  267  270  271  273  274  275  276  278  279  280 
##    1    2    1    3    2    2    1    1    2    2    2    6    1    1    5 
##  281  282  283  284  285  286  288  290  292  294  296  297  298  299  300 
##    2    2    2    3    2    1    8    2    3    3    1    3    2    4    9 
##  301  305  306  308  309  310  311  312  314  315  316  317  318  319  320 
##    2    2    3    3    1    3    3    6    1    3    1    2    1    4    2 
##  321  322  324  326  328  329  330  331  332  334  335  336  337  338  339 
##    2    1    2    2    1    4    5    2    1    2    1    5    1    4    1 
##  340  341  342  343  344  346  347  348  349  350  351  352  353  354  355 
##    3    3    2    2    1    1    1    2    1    4    3    2    5    2    1 
##  356  358  360  361  362  363  364  365  366  368  369  370  371  372  373 
##    1    2    7    2    1    1    1    1    1    6    1    3    3    2    1 
##  374  375  376  377  378  379  380  381  382  383  384  385  386  387  388 
##    2    7    1    1    4    3    2    3    1    1    8    2    2    2    2 
##  389  390  392  393  394  397  398  399  400  402  403  404  405  406  408 
##    1    4    2    1    1    3    1    3    5    2    4    1    2    2    4 
##  410  412  414  415  416  417  419  420  421  422  423  424  425  426  427 
##    4    1    3    2    2    2    2    3    2    1    1    1    4    2    1 
##  428  429  430  432  433  434  435  436  437  438  439  440  441  442  443 
##    4    1    1    5    1    1    3    1    3    2    1    1    2    4    1 
##  444  445  448  450  452  453  454  455  456  457  458  459  460  462  464 
##    2    3    2    5    1    2    3    1    7    3    2    1    3    3    1 
##  465  466  467  468  469  471  472  474  475  476  477  480  481  482  483 
##    1    1    1    6    1    2    2    3    1    4    1    4    1    1    4 
##  484  485  486  488  489  490  491  492  493  494  495  496  497  498  500 
##    2    2    3    1    2    4    1    3    2    3    4    1    1    2    7 
##  501  502  503  504  505  506  507  509  510  512  513  514  515  516  518 
##    1    1    3    6    3    2    4    2    4    2    3    1    1    3    1 
##  520  521  522  523  524  526  527  528  531  532  533  534  535  536  537 
##    4    2    3    2    2    1    3    6    2    3    3    2    2    2    1 
##  538  539  540  543  544  546  547  548  549  550  551  552  553  554  556 
##    1    4    3    1    6    4    6    3    4    5    1    3    6    2    1 
##  557  559  560  562  564  565  566  567  568  569  570  572  573  574  575 
##    1    1    6    1    2    4    2    2    2    3    3    1    1    3    2 
##  576  577  578  579  580  583  584  585  586  587  588  590  592  593  594 
##    2    1    3    1    2    2    2    2    1    1    6    1    1    2    3 
##  595  596  599  600  601  602  603  604  605  606  607  608  609  611  612 
##    5    1    2    8    1    7    3    2    1    2    1    2    2    2    4 
##  614  615  616  617  619  620  621  622  623  624  625  626  630  631  632 
##    1    1    5    2    2    1    2    2    2    7    6    4    2    1    3 
##  633  634  636  637  638  639  641  642  643  644  645  646  647  648  649 
##    4    1    2    5    2    2    4    1    3    3    1    3    1    5    2 
##  650  651  652  654  655  656  658  659  660  661  662  663  664  666  668 
##    2    2    1    2    4    4    3    4    4    1    6    4    2    3    2 
##  669  670  671  672  673  674  678  679  680  681  682  683  684  685  686 
##    1    1    1    5    2    3    2    2    3    2    2    1    1    4    5 
##  687  688  689  690  691  692  694  695  696  697  698  699  700  701  702 
##    1    1    1    3    1    1    1    3    1    5    3    2    7    1    1 
##  704  705  706  708  709  710  712  713  714  716  717  718  719  720  722 
##    4    2    3    2    1    1    3    1    1    2    2    1    2    2    1 
##  724  725  726  727  728  729  731  732  733  734  735  736  737  738  739 
##    2    1    1    2    3    2    1    5    3    4    2    2    2    2    3 
##  740  741  742  744  745  746  747  748  749  750  751  755  756  758  759 
##    2    2    1    2    2    1    4    2    1    1    1    3    1    5    1 
##  760  762  763  764  765  766  767  769  770  771  772  773  774  775  776 
##    2    3    2    1    2    2    5    3    2    1    1    3    2    3    3 
##  777  778  779  780  781  782  783  784  785  786  787  788  789  790  791 
##    2    3    3    3    4    1    1    5    2    3    2    4    3    2    2 
##  792  793  794  795  796  797  799  800  803  804  806  808  809  810  811 
##    1    2    1    1    2    1    2    3    2    2    1    1    1    2    1 
##  812  813  814  816  819  820  821  822  824  826  827  828  830  831  832 
##    4    2    2    4    2    3    2    2    2    1    1    4    1    2    3 
##  833  836  837  838  840  841  842  844  846  847  848  850  851  852  853 
##    2    4    1    1    1    4    1    1    3    2    1    2    4    1    1 
##  854  856  859  860  862  863  864  865  866  867  870  871  872  873  874 
##    1    3    1    2    1    1    4    2    3    1    3    2    3    1    1 
##  876  880  881  883  885  888  890  892  893  894  895  896  897  899  900 
##    2    1    1    1    1    1    2    1    1    2    1    1    1    1    1 
##  901  902  903  904  905  906  908  909  910  912  913  914  915  916  918 
##    2    4    2    5    1    1    1    1    2    1    2    1    5    1    1 
##  919  920  922  923  924  925  926  929  930  931  932  935  936  937  938 
##    1    3    2    2    1    2    1    2    2    2    2    1    7    1    2 
##  939  941  943  944  945  946  949  950  951  952  953  954  955  956  958 
##    1    2    1    2    2    1    1    1    1    1    1    1    1    2    2 
##  960  962  964  965  967  968  969  970  973  975  976  978  980  982  983 
##    1    3    3    1    2    1    1    1    1    2    2    1    2    1    1 
##  984  985  986  987  988  990  991  994  996  998 1000 1001 1002 1003 1004 
##    1    1    4    1    4    1    1    2    2    3    4    1    1    1    2 
## 1005 1010 1011 1012 1013 1014 1015 1016 1018 1021 1022 1023 1024 1026 1027 
##    3    1    1    2    1    1    1    2    2    1    1    3    2    1    1 
## 1029 1030 1032 1033 1034 1035 1036 1037 1038 1039 1040 1044 1046 1047 1048 
##    1    2    2    1    1    1    4    1    1    2    2    1    1    1    1 
## 1051 1053 1056 1059 1064 1065 1070 1071 1073 1074 1075 1078 1079 1080 1082 
##    1    2    3    4    1    2    2    1    1    1    1    2    1    1    2 
## 1084 1085 1086 1087 1088 1090 1092 1094 1096 1097 1098 1101 1104 1106 1110 
##    3    1    1    1    2    1    1    1    1    1    1    1    2    1    2 
## 1111 1112 1115 1116 1118 1121 1122 1124 1126 1127 1128 1129 1136 1137 1138 
##    1    1    1    3    1    1    1    1    1    1    1    1    1    1    1 
## 1141 1142 1148 1149 1150 1151 1152 1153 1154 1157 1158 1159 1162 1163 1165 
##    1    1    3    1    1    1    2    3    1    1    1    2    1    1    1 
## 1170 1172 1173 1178 1180 1181 1182 1186 1188 1191 1194 1196 1198 1200 1201 
##    1    1    3    1    1    1    1    1    1    1    1    2    2    5    3 
## 1204 1206 1213 1216 1218 1219 1220 1223 1224 1225 1230 1231 1232 1234 1236 
##    1    2    1    1    4    2    2    1    1    1    1    1    2    2    1 
## 1237 1238 1239 1243 1246 1247 1249 1252 1258 1259 1260 1261 1262 1270 1271 
##    1    1    1    1    2    1    3    1    2    1    1    1    1    1    1 
## 1274 1277 1280 1282 1283 1285 1288 1290 1294 1298 1300 1302 1304 1308 1309 
##    3    1    1    1    1    1    2    1    1    1    4    1    1    1    3 
## 1312 1314 1319 1320 1324 1328 1329 1330 1332 1333 1334 1336 1337 1338 1341 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 1346 1350 1351 1358 1359 1360 1361 1369 1373 1375 1383 1386 1387 1390 1392 
##    1    2    1    1    1    2    1    2    3    1    2    2    2    1    1 
## 1398 1400 1406 1410 1412 1414 1416 1420 1422 1430 1433 1436 1440 1441 1443 
##    1    2    1    2    1    1    2    1    1    2    1    1    1    1    1 
## 1445 1447 1455 1456 1460 1464 1470 1474 1476 1478 1480 1500 1505 1513 1518 
##    1    1    2    1    1    1    1    1    2    1    1    1    1    1    2 
## 1531 1538 1540 1557 1558 1562 1564 1567 1571 1572 1573 1576 1593 1606 1619 
##    1    1    1    1    1    1    1    2    1    2    3    1    1    1    1 
## 1632 1636 1640 1646 1660 1682 1684 1696 1721 1728 1732 1733 1758 1767 1810 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 1812 1836 1880 1904 1965 1972 2085 2096 2146 2158 2188 2257 2260 2288 4010 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 5644 
##    1
all$BsmtFinSF1[c(2121,2189)]<-0

table(all$BsmtFinSF2)
## 
##    0    6   12   28   32   35   38   40   41   42   46   48   52   60   63 
## 2571    1    1    1    1    1    1    2    2    2    1    1    1    2    1 
##   64   66   68   72   76   78   80   81   92   93   95   96  102  105  106 
##    2    1    2    2    1    1    2    1    1    2    1    2    1    2    1 
##  108  110  113  116  117  119  120  121  123  125  127  128  132  136  138 
##    2    2    1    2    2    1    1    2    1    1    2    2    1    1    1 
##  139  144  147  149  150  153  154  156  159  162  163  165  167  168  169 
##    1    3    3    1    1    1    1    1    2    3    1    1    1    3    1 
##  173  174  175  177  180  181  182  184  186  193  196  201  202  206  208 
##    1    2    1    1    5    1    2    1    1    1    1    1    2    1    1 
##  210  211  215  216  219  227  228  230  232  239  240  243  247  250  252 
##    2    1    1    1    1    1    1    1    1    1    2    1    2    1    2 
##  258  259  262  263  264  270  273  276  278  279  281  284  286  287  288 
##    1    1    1    1    2    2    2    2    1    2    1    1    1    2    2 
##  290  294  297  306  308  311  319  321  324  334  336  337  344  345  350 
##    2    5    1    1    1    1    1    1    1    1    1    1    2    1    1 
##  351  352  354  360  362  364  373  374  375  377  380  382  387  391  393 
##    1    1    2    1    1    1    1    3    1    1    1    1    1    2    1 
##  396  398  400  402  404  411  417  419  420  432  435  438  441  442  448 
##    1    1    2    1    1    1    1    1    1    1    3    1    1    1    1 
##  449  450  453  456  465  466  468  469  472  474  479  480  483  486  488 
##    1    1    1    1    2    1    2    2    1    1    1    2    3    1    1 
##  491  492  495  497  499  500  506  507  512  522  529  530  531  532  539 
##    1    2    2    1    1    1    1    1    1    1    1    1    1    1    3 
##  543  544  546  547  551  555  557  580  590  596  600  604  606  608  612 
##    1    1    1    1    2    1    1    1    2    2    1    1    1    1    1 
##  613  619  620  622  624  627  630  634  645  661  668  670  679  682  684 
##    1    1    2    1    1    1    1    1    1    1    1    2    1    1    1 
##  688  690  691  692  694  712  713  722  723  748  750  755  761  764  768 
##    1    1    1    1    1    2    1    1    2    1    1    1    1    1    1 
##  774  791  799  811  820  823  826  829  831  841  842  850  852  859  869 
##    1    1    1    1    1    1    1    1    1    2    1    1    1    1    1 
##  872  873  875  884  891  904  906  912  915  955  956  972  981  982 1020 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 1029 1031 1037 1039 1057 1061 1063 1073 1080 1083 1085 1120 1127 1164 1393 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 1474 1526 
##    1    1
all$BsmtFinSF2[c(2121,2189)]<-0

table(all$BsmtUnfSF)
## 
##    0   14   15   17   20   22   23   25   26   27   28   29   30   32   33 
##  241    1    1    1    1    1    2    3    1    1    1    1    6    2    1 
##   34   35   36   38   39   40   42   45   46   48   50   52   53   54   55 
##    1    2    5    1    1    1    1    1    3    2    1    2    1    2    3 
##   56   57   58   60   61   63   64   70   72   74   75   76   77   78   79 
##    2    1    1    2    1    1    2    2    4    1    4    4    2    3    1 
##   80   81   82   83   84   86   88   89   90   91   92   93   94   95   96 
##    7    1    1    3    2    1    6    3    5    1    4    3    2    2    5 
##   98   99  100  102  103  104  105  106  107  108  109  110  111  112  113 
##    3    3   11    3    2    2    3    1    2    8    1    2    2    3    2 
##  114  115  117  118  119  120  121  122  123  124  125  126  127  128  129 
##    3    5    2    1    1    7    3    3    1    2    6    3    1    1    1 
##  130  131  132  133  134  135  136  137  138  140  141  142  143  144  147 
##    5    2    6    3    2    4    2    2    5    5    3    3    4    3    1 
##  148  149  150  151  153  154  155  156  158  160  161  162  163  164  165 
##    3    1    6    2    3    1    1    2    2    6    3    8    6    1    2 
##  166  167  168  169  170  171  172  173  174  175  176  177  178  179  180 
##    5    3    6    2    2    3    5    4    3    4    4    2    1    3    5 
##  181  182  183  184  185  186  187  188  189  190  191  192  193  194  195 
##    1    2    1    3    1    9    2    4    3    6    3    5    3    1    6 
##  196  197  198  200  201  203  204  206  207  208  210  212  213  214  215 
##    2    4    2    6    4    3    4    3    3    4    6    5    3    1    2 
##  216  217  218  219  220  221  222  223  224  225  226  227  228  229  230 
##   11    2    4    3    4    2    3    3    6    3    5    1    5    3    3 
##  231  232  233  234  235  236  237  238  239  240  241  242  243  244  245 
##    1    4    1    3    4    1    2    3    1    6    1    3    1    3    5 
##  246  247  248  249  250  251  252  253  254  255  256  257  258  260  261 
##    1    3    5    1    5    2    7    4    4    2    2    1    1    1    5 
##  262  263  264  265  266  268  269  270  271  272  273  274  275  276  277 
##    2    2    9    3    1    4    1   10    1    1    2    1    1    6    2 
##  278  279  280  281  282  284  285  286  288  289  290  291  292  293  294 
##    6    3    9    2    4    5    3    4    4    2    2    2    6    2    8 
##  295  296  297  298  299  300  301  303  304  305  306  307  308  309  310 
##    3    1    4    2    3   10    3    1    3    1    5    1    4    1    3 
##  311  312  313  314  315  316  317  318  319  320  321  322  323  324  325 
##    2    6    1    1    1    6    3    4    7    3    3    7    2    7    3 
##  326  327  328  329  330  331  332  333  334  335  336  338  339  340  341 
##    6    1    3    3    2    2    2    3    1    1    7    2    2    5    3 
##  342  343  344  345  346  348  349  350  352  354  355  356  357  358  359 
##    6    3    2    2    4    8    1    5    2    4    2    7    3    1    2 
##  360  361  362  363  364  365  366  367  369  370  371  372  374  375  378 
##    5    1    1    4    2    2    2    3    2    3    3    3    2    2    4 
##  379  380  381  382  383  384  385  386  387  388  389  390  392  393  394 
##    3    4    4    1    2   19    1    4    1    1    1    6    7    1    1 
##  395  396  397  398  399  400  402  403  404  405  406  407  408  409  410 
##    2    9    3    3    4    4    4    3    5    6    3    1    6    2    6 
##  411  412  413  414  415  416  417  418  420  422  423  424  425  426  427 
##    3    2    3    2    3    6    1    1    5    5    2    3    3    5    3 
##  428  429  430  431  432  433  434  435  436  437  438  439  440  441  442 
##    1    2    3    3    6    1    5    1    2    2    2    1    8    4    1 
##  443  444  445  447  448  449  450  451  452  453  454  455  456  457  458 
##    2    2    1    1    7    2    4    1    1    1    1    4    6    3    1 
##  459  460  461  462  463  464  465  466  467  468  469  470  471  472  473 
##    3    6    3    4    1    1    1    4    3    2    1    3    1    1    1 
##  474  475  476  477  479  480  481  482  483  484  485  486  487  488  490 
##    5    2    5    1    2    8    2    4    3    5    2    3    2    1    6 
##  491  494  495  496  497  498  499  500  501  502  503  504  505  506  507 
##    4    2    2    4    2    3    2    2    3    1    4    5    1    3    2 
##  508  510  512  513  515  516  517  519  520  521  522  523  524  525  526 
##    2    3    1    3    1    4    1    1    4    1    4    1    2    7    2 
##  527  528  529  530  532  533  534  535  536  537  538  539  540  541  542 
##    1    2    1    2    1    4    2    1    6    1    3    1    4    1    1 
##  543  544  545  546  547  548  549  550  551  552  554  555  556  557  559 
##    2    4    3    8    1    3    2    3    1    4    4    1    5    1    1 
##  560  561  563  564  565  567  568  569  570  571  572  574  575  576  577 
##    4    1    1    4    1    2    1    3    6    1   11    1    2    6    2 
##  578  579  580  581  583  584  585  586  587  588  589  590  591  592  593 
##    1    1    1    1    1    1    6    2    1    4    1    4    1    2    1 
##  594  595  596  598  599  600  602  604  605  606  608  610  611  612  613 
##    4    3    5    5    3   12    3    3    2    1    2    2    4    3    1 
##  615  616  618  619  620  621  622  624  625  626  628  630  631  632  634 
##    1    4    2    1    1    1    2   10    6    2    4    7    1    2    2 
##  635  636  637  638  639  640  641  643  644  645  646  647  648  649  650 
##    2    2    2    5    2    2    5    1    2    1    1    1    1    2    2 
##  651  652  655  656  657  658  659  660  661  662  663  664  666  667  668 
##    1    2    1    3    1    2    1    6    3    4    2    2    3    1    1 
##  671  672  673  674  675  676  677  678  679  680  683  684  686  689  690 
##    2   13    2    1    2    6    2    3    1    2    1    6    5    4    1 
##  691  692  693  697  698  700  701  702  704  705  706  707  708  709  710 
##    1    2    1    4    6    3    1    3    2    1    2    2    1    1    4 
##  712  713  715  716  717  718  720  721  722  723  724  725  726  727  728 
##    3    2    2    2    2    2    4    1    2    3    2    4    1    1   14 
##  729  730  731  732  733  734  735  736  738  739  740  741  742  744  745 
##    1    1    1    3    1    1    1    4    7    2    2    4    4    4    1 
##  746  747  748  749  750  752  755  756  758  760  761  762  763  764  765 
##    2    5    2    1    2    3    2    4    1    1    1    2    2    3    2 
##  767  768  769  770  771  772  773  774  776  777  778  779  780  781  782 
##    1    9    2    2    1    1    1    1    2    1    3    1    8    2    2 
##  783  784  785  788  789  791  792  793  794  795  796  797  798  799  801 
##    2    7    5    2    2    1    2    2    2    1    5    2    3    1    3 
##  802  803  804  806  808  809  810  811  812  813  814  815  816  817  818 
##    1    1    1    2    4    1    3    3    1    2    1    4   11    2    2 
##  819  821  823  825  827  828  830  831  832  833  834  835  836  840  841 
##    1    2    1    2    1    1    1    3    8    2    1    2    1    8    1 
##  844  845  846  847  848  849  850  851  852  856  858  859  860  861  862 
##    3    1    4    7    2    4    3    3    1    1    2    1    2    2    1 
##  863  864  866  868  869  871  872  873  874  876  877  878  879  880  882 
##    1    4    1    3    1    1    2    1    1    2    1    2    2    3    1 
##  884  888  889  891  892  893  894  896  897  898  899  901  903  905  906 
##    6    1    1    2    2    1    5    4    1    1    1    4    1    1    1 
##  907  908  910  912  914  916  917  918  920  924  925  926  927  928  929 
##    1    1    1    6    1    3    1    3    1    2    4    1    2    1    1 
##  930  931  932  935  936  938  939  940  941  944  945  948  949  951  952 
##    2    1    3    3    6    4    3    1    3    1    2    1    1    1    5 
##  953  957  958  960  961  963  964  966  967  968  969  970  971  972  974 
##    2    1    1    5    1    1    1    1    3    1    2    5    2    2    1 
##  975  976  977  978  982  983  988  989  992  993  994  995  996  998  999 
##    2    1    1    1    2    1    1    1    1    1    2    1    1    1    1 
## 1005 1007 1008 1010 1012 1013 1017 1018 1020 1022 1026 1028 1030 1032 1035 
##    1    1    5    1    2    1    3    2    2    1    2    1    1    1    2 
## 1040 1041 1042 1043 1045 1046 1048 1050 1052 1053 1054 1055 1057 1058 1063 
##    4    1    1    2    1    2    2    1    1    1    2    2    1    1    1 
## 1064 1065 1066 1068 1072 1073 1074 1075 1077 1078 1079 1081 1082 1084 1085 
##    3    2    1    1    1    1    2    1    2    1    1    1    2    1    2 
## 1087 1088 1090 1092 1093 1094 1095 1097 1098 1099 1100 1104 1105 1107 1108 
##    1    2    2    2    2    1    3    1    1    1    4    1    1    1    1 
## 1114 1115 1116 1117 1118 1120 1121 1122 1125 1126 1128 1129 1131 1139 1140 
##    3    1    1    2    1    1    1    1    1    1    2    3    1    2    3 
## 1141 1143 1144 1145 1146 1150 1152 1153 1158 1162 1163 1164 1168 1173 1176 
##    2    3    1    1    1    1    2    1    3    1    1    1    1    1    1 
## 1179 1180 1181 1184 1191 1194 1195 1198 1200 1203 1204 1209 1211 1212 1214 
##    1    1    2    1    2    1    2    2    1    1    2    1    1    1    1 
## 1216 1217 1218 1221 1226 1228 1232 1234 1237 1240 1242 1246 1248 1249 1250 
##    1    1    2    1    2    2    2    2    2    2    1    1    1    1    1 
## 1251 1252 1254 1257 1258 1262 1264 1266 1270 1272 1273 1276 1280 1284 1286 
##    1    1    1    2    3    1    1    2    1    1    1    2    1    1    2 
## 1288 1290 1293 1296 1298 1300 1302 1304 1307 1308 1313 1314 1316 1318 1319 
##    2    2    1    1    1    1    2    1    1    1    2    1    2    1    1 
## 1323 1324 1326 1327 1328 1330 1332 1335 1337 1339 1341 1342 1344 1346 1347 
##    1    1    1    1    1    3    1    2    1    2    1    2    1    3    1 
## 1348 1349 1351 1352 1357 1362 1367 1368 1369 1372 1373 1374 1375 1376 1380 
##    1    1    1    1    1    2    1    2    1    1    1    1    2    1    1 
## 1390 1392 1393 1400 1402 1404 1405 1406 1410 1411 1413 1417 1420 1421 1422 
##    2    1    1    1    1    1    2    1    1    1    1    1    1    1    1 
## 1424 1428 1430 1434 1436 1439 1442 1444 1450 1451 1453 1459 1461 1466 1468 
##    2    1    1    1    2    1    1    1    2    2    1    1    1    1    4 
## 1470 1473 1474 1480 1482 1486 1488 1489 1491 1494 1495 1496 1497 1498 1502 
##    1    1    1    1    1    1    1    2    2    1    1    3    1    2    1 
## 1503 1504 1508 1510 1515 1519 1526 1527 1528 1530 1541 1544 1550 1556 1558 
##    1    1    1    1    1    1    1    1    3    4    1    1    2    1    1 
## 1559 1560 1566 1568 1571 1573 1574 1580 1582 1584 1588 1590 1594 1595 1598 
##    1    4    1    2    1    1    1    2    1    3    2    1    2    2    1 
## 1602 1603 1604 1614 1615 1616 1618 1619 1622 1625 1626 1629 1630 1632 1638 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    3    1 
## 1640 1643 1649 1652 1656 1660 1664 1670 1678 1680 1685 1686 1689 1694 1696 
##    1    2    1    1    2    1    1    1    1    2    1    1    2    3    1 
## 1704 1706 1709 1710 1721 1726 1728 1734 1736 1752 1753 1765 1768 1774 1777 
##    1    1    1    1    1    1    1    1    1    1    2    1    1    1    1 
## 1794 1795 1800 1802 1824 1836 1851 1866 1869 1905 1907 1921 1926 1935 1958 
##    1    1    3    1    2    2    1    1    1    1    1    1    1    1    1 
## 1967 1969 2002 2042 2046 2062 2121 2140 2153 2336 
##    1    1    1    1    1    1    1    1    1    1
all$BsmtUnfSF[c(2121,2189)]<-0


NaNum(all)
##   SalePrice  MasVnrType  MasVnrArea    MSZoning   Utilities  Functional 
##        1459          24          23           4           2           2 
## Exterior1st Exterior2nd  Electrical KitchenQual    SaleType 
##           1           1           1           1           1

Imputing missing data: masonry

NaNum(all)
##   SalePrice  MasVnrType  MasVnrArea    MSZoning   Utilities  Functional 
##        1459          24          23           4           2           2 
## Exterior1st Exterior2nd  Electrical KitchenQual    SaleType 
##           1           1           1           1           1
all$MasVnrArea[which(is.na(all$MasVnrType)&!is.na(all$MasVnrArea))]<-'None'


table(all$MasVnrType)
## 
##  BrkCmn BrkFace    None   Stone 
##      25     879    1742     249
table(all$MasVnrArea)
## 
##    0    1  100  101  102 1031  104 1047  105 1050  106  108  109 1095   11 
## 1738    3    5    3    2    1    4    1    2    1    7   11    1    1    1 
##  110 1110 1115  112 1129  113  114  115 1159  116  117 1170  118  119  120 
##    3    1    1    6    1    3    2    3    1    3    2    1    1    2   15 
##  121  122 1224  123  124  125  126  127  128 1290  130  132  134  135  136 
##    1    3    2    3    1    3    4    1    9    1    6    8    2    3    5 
##  137 1378  138   14  140  141  142  143  144  145  146  147  148  149  150 
##    1    1    2    4    7    1    2    6   11    6    2    2    5    4    5 
##  151  153  154  156  157  158   16  160 1600  161  162  163  164  165  166 
##    1    3    1    3    3    3   11    5    1    3    5    2    7    3    4 
##  167  168  169  170  171  172  174  175  176  177  178  179   18  180  182 
##    1    5    3    8    2    5    7    1   13    1    8    1    3   12    5 
##  183  184  186  187  188  189  190  192  194  196  197  198  199   20  200 
##    4    3    7    1    3    3    3    4    5    9    1    5    1    4   13 
##  202  203  204  205  206  207  208  209  210  212  214  215  216  217  218 
##    2    7    2    3    5    1    3    2    9    4    1    3   12    1    3 
##  219   22  220  221  222  223  224  225  226  227  228  229   23  230  232 
##    1    2    4    1    1    1    1    1    4    2    2    1    4    2    6 
##  233  234  235  236  237  238   24  240  242  243  244  245  246  247  248 
##    2    2    1    3    1    4    2    7    4    2    2    2    6    1    4 
##  250  251  252  253  254  255  256  257  258  259  260  261  262  263  264 
##    4    1    7    1    2    1    8    1    2    2    7    2    1    1    3 
##  265  266  268   27  270  272  274  275  276  278  279   28  280  281  283 
##    2    2    5    1    7    5    1    3    1    2    1    2    4    2    1 
##  284  285  286  287  288  289  290  291  292  293  294  295  296  297  298 
##    3    3    2    1    6    3    3    1    2    1    2    3    2    1    3 
##  299    3   30  300  302  304  305  306  308  309   31  310  312  315  318 
##    1    1    4    7    8    3    3    6    1    2    1    3    3    1    2 
##   32  320  322  323  324  327  328  332  333  335  336  337  338   34  340 
##    4    7    1    1    1    1    2    1    1    2    4    1    2    1   10 
##  342  344  348  350  351  352  353  355  356  359   36  360  361  362  364 
##    2    2    1    3    2    2    1    1    2    2    2    7    1    2    2 
##  365  366  368  370  371  372  375  376  378  379   38  380  381  382  383 
##    2    2    2    1    1    1    1    1    2    1    2    2    1    1    2 
##  385  387  388   39  391  394  396  397  399   40  400  402  405  406  408 
##    1    1    1    1    1    1    1    1    1    8    1    2    1    1    1 
##   41  410  412  415  418   42  420  422  423  424  425  426  428  430  432 
##    3    2    1    1    1    3    7    2    3    2    3    1    1    2    2 
##  434  435  436  438   44  440  442  443  444  448   45  450  451  452  456 
##    1    1    1    1    7    1    3    1    1    1    3    4    1    1    7 
##  459   46  464  466  468   47  470  472  473  479   48  480  481  491  492 
##    1    1    1    3    2    1    1    3    3    1    1    4    1    1    2 
##  495   50  500  501  502  504  506  509   51  510  513  514  515  518  519 
##    1    7    2    1    1    6    2    1    3    2    5    1    1    1    1 
##   52  522  525  526  528   53  530  532   54  541  549  550  554   56  562 
##    3    1    2    1    1    2    1    1    4    1    1    1    3    2    1 
##  564  567  568   57  571  572  573  576  579   58  584  594   60  600  603 
##    1    2    2    1    1    1    1    1    1    2    1    1    7    3    1 
##  604  615  616   62  621   63  630  632  634   64  640  647   65  650  651 
##    1    1    1    1    2    1    1    2    1    1    1    1    2    2    1 
##  652  653  657   66  660  662  664  668   67  673  674   68  680   69  692 
##    1    1    1    2    2    1    1    1    2    1    2    5    1    1    1 
##   70  705  710  714   72  724  726  730  731  734  738   74  748   75  754 
##    4    1    1    1   11    1    1    1    1    1    1    4    1    2    1 
##   76  760  762  766  768  771  772  788  796   80   81  816   82   84   85 
##    7    1    1    1    1    1    1    1    1    9    1    1    5    7    4 
##   86  860   87  870  877   88  886   89  894   90  902   91   92  921  922 
##    3    1    1    1    1    5    1    2    1    6    1    1    2    1    1 
##   94  945   95   96   97  970  975   98   99 None 
##    4    1    3    4    1    1    1    5    4    1
all$MasVnrArea[which(is.na(all$MasVnrArea))]<-0
all$MasVnrType[which(is.na(all$MasVnrType))]<-'None'

table(all$MasVnrType)
## 
##  BrkCmn BrkFace    None   Stone 
##      25     879    1766     249
detach("package:plyr", unload=TRUE)
all[!is.na(SalePrice), ]%>%group_by(MasVnrType)%>%
  summarise(median=median(SalePrice),count=n())%>%arrange(median)
#由此可知,none和BrkCmn對價錢沒什麼影響力,BrkFace和Stone則影響著價錢
#有order, 化作integer

Masonry<-c('BrkCmn'=0,'None'=0,'BrkFace'=1,'Stone'=2)
library(plyr)
all$MasVnrType<-as.integer(plyr::revalue(all$MasVnrType,Masonry))

Imputing missing data:MsZoning

table(all$MSZoning)
## 
## C (all)      FV      RH      RL      RM 
##      25     139      26    2265     460
sum(is.na(all$MSZoning))
## [1] 4
all$MSZoning[is.na(all$MSZoning)]<-names(sort(-table(all$MSZoning),increasing=TRUE))[1]
all$MSZoning<-as.factor(all$MSZoning)
sum(table(all$MSZoning))
## [1] 2919

Imputing missing data:kitchen

NaNum(all)
##   SalePrice   Utilities  Functional Exterior1st Exterior2nd  Electrical 
##        1459           2           2           1           1           1 
## KitchenQual    SaleType 
##           1           1
table(all$KitchenQual)
## 
##   Ex   Fa   Gd   TA 
##  205   70 1151 1492
all$KitchenQual[is.na(all$KitchenQual)]<-names(sort(-table(all$KitchenQual),increasing=TRUE))[1]
all$KitchenQual<-as.integer(plyr::revalue(all$KitchenQual,Qualities))
####
class(all$KitchenAbvGr)
## [1] "numeric"

Imputing missing data:Utilities

NaNum(all)
##   SalePrice   Utilities  Functional Exterior1st Exterior2nd  Electrical 
##        1459           2           2           1           1           1 
##    SaleType 
##           1
table(Utilities)
## Utilities
## AllPub NoSeWa 
##   2916      1
all$Utilities[is.na(all$Utilities)]<-names(sort(-table(all$Utilities),increasing=TRUE))[1]
#However, the table below shows that only one house does not have all public utilities. This house is in the train set. Therefore, imputing ‘AllPub’ for the NAs means that all houses in the test set will have ‘AllPub’. This makes the variable useless for prediction. Consequently, I will get rid of it.

kable(all[is.na(all$Utilities)|all$Utilities=='NoSeWa',1:9])
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities
20 RL 82 14375 Pave None 2 Lvl NoSeWa
all$Utilities<- NULL

##當整個column的value都一樣時,這個column就沒用了

Imputing missing data: HomeFunctionality

NaNum(all)
##   SalePrice  Functional Exterior1st Exterior2nd  Electrical    SaleType 
##        1459           2           1           1           1           1
table(all$Functional)
## 
## Maj1 Maj2 Min1 Min2  Mod  Sev  Typ 
##   19    9   65   70   35    2 2717
all$Functional[is.na(all$Functional)]<-names(sort(-table(all$Functional),increasing=TRUE))[1]
all$Functional<-as.integer(plyr::revalue(all$Functional, c('Sal'=0,'Sev'=1, 'Maj2'=2, 'Maj1'=3, 'Mod'=4,'Min2'=5, 'Min1'=6,'Typ'=7)))

Imputing missing data:Exterior

NaNum(all)
##   SalePrice Exterior1st Exterior2nd  Electrical    SaleType 
##        1459           1           1           1           1
table(all$ExterQual)
## 
##   Ex   Fa   Gd   TA 
##  107   35  979 1798
all$ExterQual<-as.integer(plyr::revalue(all$ExterQual, Qualities))
###
table(all$ExterCond)
## 
##   Ex   Fa   Gd   Po   TA 
##   12   67  299    3 2538
all$ExterCond<-as.integer(plyr::revalue(all$ExterCond, Qualities))
###
table(all$Exterior2nd)
## 
## AsbShng AsphShn Brk Cmn BrkFace  CBlock CmentBd HdBoard ImStucc MetalSd 
##      38       4      22      47       3     126     406      15     447 
##   Other Plywood   Stone  Stucco VinylSd Wd Sdng Wd Shng 
##       1     270       6      47    1014     391      81
all$Exterior2nd[is.na(all$Exterior2nd)]<-names(sort(-table(all$Exterior2nd),increasing=TRUE))[1]
###
table(all$Exterior1st)
## 
## AsbShng AsphShn BrkComm BrkFace  CBlock CemntBd HdBoard ImStucc MetalSd 
##      44       2       6      87       2     126     442       1     450 
## Plywood   Stone  Stucco VinylSd Wd Sdng WdShing 
##     221       2      43    1025     411      56
all$Exterior1st[is.na(all$Exterior1st)]<-names(sort(-table(all$Exterior1st),increasing=TRUE))[1]

Imputing missing data:Electrical

NaNum(all)
##  SalePrice Electrical   SaleType 
##       1459          1          1
table(all$Electrical)
## 
## FuseA FuseF FuseP   Mix SBrkr 
##   188    50     8     1  2671
all$Electrical[is.na(all$Electrical)]<-names(sort(-table(all$Electrical),incresing=TRUE))[1]

Imputing missing data:SaleType

NaNum(all)
## SalePrice  SaleType 
##      1459         1
table(all$SaleType)
## 
##   COD   Con ConLD ConLI ConLw   CWD   New   Oth    WD 
##    87     5    26     9     8    12   239     7  2525
all$SaleType[is.na(all$SaleType)]<-names(sort(-table(all$SaleType),increasing=TRUE))[1]
###
table(all$SaleCondition)
## 
## Abnorml AdjLand  Alloca  Family  Normal Partial 
##     190      12      24      46    2402     245
detach("package:plyr", unload=TRUE)
all[!is.na(all$SalePrice),]%>%group_by(SaleCondition)%>%
  summarise(median=median(SalePrice),count=n())%>%arrange(median)
all$SaleType<-as.factor(all$SaleType)
#思考一下要不要換成ordinal
Label encoding/factorizing the remaining chactogrical variables
##還有幾個variable是charactor
Charcol<- names(all[,sapply(all,is.character)])
##寫成function
ChaNum<-function(df){
  names(df[,sapply(df,is.character)])
}

cat('There are', length(Charcol), 'remaining catagorical variables in the data')
## There are 21 remaining catagorical variables in the data
Charcol
##  [1] "Street"        "LandContour"   "LotConfig"     "LandSlope"    
##  [5] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
##  [9] "HouseStyle"    "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [13] "Exterior2nd"   "MasVnrArea"    "Foundation"    "Heating"      
## [17] "HeatingQC"     "CentralAir"    "Electrical"    "PavedDrive"   
## [21] "SaleCondition"

Factorizing character: Foundation

all$Foundation<-as.factor(all$Foundation)
table(all$Foundation)
## 
## BrkTil CBlock  PConc   Slab  Stone   Wood 
##    311   1235   1308     49     11      5
sum(table(all$Foundation))
## [1] 2919

Factorizing character: Heating and airco

all$Heating<-as.factor(all$Heating)
table(all$Heating)
## 
## Floor  GasA  GasW  Grav  OthW  Wall 
##     1  2874    27     9     2     6
sum(table(all$Heating))
## [1] 2919
###
all$HeatingQC<-as.integer(plyr::revalue(all$HeatingQC,Qualities))
table(all$HeatingQC)
## 
##    1    2    3    4    5 
##    3   92  857  474 1493
sum(table(all$HeatingQC))
## [1] 2919
###
#可以化為布林,不要放過
all$CentralAir<-as.integer(plyr::revalue(all$CentralAir,c('N'=0,'Y'=1)))
table(all$CentralAir)
## 
##    0    1 
##  196 2723
sum(table(CentralAir))
## [1] 2919

Factorizing character: Roof

ChaNum(all)
##  [1] "Street"        "LandContour"   "LotConfig"     "LandSlope"    
##  [5] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
##  [9] "HouseStyle"    "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [13] "Exterior2nd"   "MasVnrArea"    "Electrical"    "PavedDrive"   
## [17] "SaleCondition"
all$RoofMatl<-as.factor(all$RoofMatl)
table(all$RoofMatl)
## 
## ClyTile CompShg Membran   Metal    Roll Tar&Grv WdShake WdShngl 
##       1    2876       1       1       1      23       9       7
sum(table(all$RoofMatl))
## [1] 2919
###
all$RoofStyle<-as.factor(all$RoofStyle)
table(all$RoofStyle)
## 
##    Flat   Gable Gambrel     Hip Mansard    Shed 
##      20    2310      22     551      11       5
length(all$RoofStyle)
## [1] 2919

Factorizing character: Land,neighborhood

ChaNum(all)
##  [1] "Street"        "LandContour"   "LotConfig"     "LandSlope"    
##  [5] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
##  [9] "HouseStyle"    "Exterior1st"   "Exterior2nd"   "MasVnrArea"   
## [13] "Electrical"    "PavedDrive"    "SaleCondition"
##不能像下面這樣寫(一次把所有的column叫出來factorize),會把全部都變成intweger(查一下為什麼!!!)
####str(all[,c("LandContour","LandSlope","Neighborhood")])
####all[,c("LandContour","LandSlope","Neighborhood")]<-c(as.factor(all$LandContour),as.factor(all$LandSlope),as.factor(all$Neighborhood))
####str(all[,c("LandContour","LandSlope","Neighborhood")])


table(all$LandContour)
## 
##  Bnk  HLS  Low  Lvl 
##  117  120   60 2622
ContourType<-c('Bnk'=1,'HLS'=2,'Low'=3,'Lvl'=4)
all$LandContour<-as.integer(plyr::revalue(all$LandContour,ContourType))
sum(table(all$LandContour))
## [1] 2919
###
table(all$LandSlope)
## 
##  Gtl  Mod  Sev 
## 2778  125   16
all$LandSlope<-as.integer(plyr::revalue(all$LandSlope,c('Sev'=0,'Mod'=1,'Gtl'=2)))
sum(table(all$LandSlope))
## [1] 2919
###
table(all$Neighborhood)
## 
## Blmngtn Blueste  BrDale BrkSide ClearCr CollgCr Crawfor Edwards Gilbert 
##      28      10      30     108      44     267     103     194     165 
##  IDOTRR MeadowV Mitchel   NAmes NoRidge NPkVill NridgHt  NWAmes OldTown 
##      93      37     114     443      71      23     166     131     239 
##  Sawyer SawyerW Somerst StoneBr   SWISU  Timber Veenker 
##     151     125     182      51      48      72      24
all$Neighborhood<-as.factor(all$Neighborhood)
sum(table(all$Neighborhood))
## [1] 2919
class(all$Neighborhood)
## [1] "factor"
###
table(all$Condition1)
## 
## Artery  Feedr   Norm   PosA   PosN   RRAe   RRAn   RRNe   RRNn 
##     92    164   2511     20     39     28     50      6      9
all$Condition1<-as.factor(all$Condition1)
###
table(all$Condition2)
## 
## Artery  Feedr   Norm   PosA   PosN   RRAe   RRAn   RRNn 
##      5     13   2889      4      4      1      1      2
all$Condition2<-as.factor(all$Condition2)

Factorizing character: Dwelling

ChaNum(all)
##  [1] "Street"        "LotConfig"     "BldgType"      "HouseStyle"   
##  [5] "Exterior1st"   "Exterior2nd"   "MasVnrArea"    "Electrical"   
##  [9] "PavedDrive"    "SaleCondition"
table(all$BldgType)
## 
##   1Fam 2fmCon Duplex  Twnhs TwnhsE 
##   2425     62    109     96    227
##注意,給定絕對路徑後,x,y就不會和指定的data掛鉤,這樣是不行的:eg: ggplot(all[!is.na(SalePrice),],aes(all$BldgType, all$SalePrice))
ggplot(all[!is.na(SalePrice),],aes(BldgType, SalePrice))+
  geom_bar(stat='summary', fun.y='median',fill='blue')+
  scale_y_continuous(breaks=seq(0,200000,50000),labels=comma)+
  geom_label(stat='count',aes(label=..count..,y=..count..))

all[!is.na(SalePrice),]%>%group_by(BldgType)%>%summarise(median=median(SalePrice), count=n())%>%arrange(median)
##no ordinality???
all$BldgType<- as.factor(all$BldgType)
class(all$BldgType)
## [1] "factor"

Factorizing character: Pavement of street and drive way

table(all$Street)
## 
## Grvl Pave 
##   12 2907
all$Street<-as.integer(plyr::revalue(all$Street,c('Grvl'=0,'Pave'=1)))
##
table(all$PavedDrive)
## 
##    N    P    Y 
##  216   62 2641
all$PavedDrive<-as.integer(plyr::revalue(all$PavedDrive, c('N'=0,'P'=1,'Y'=2)))
At this point, all variables are complete (No NAs), and all character variables are converted into either numeric labels of into factors. However, there are 3 variables that are recorded numeric but should actually be categorical.

Changing some numeric variables into factors:Year and Month Sold

table(all$YrSold)
## 
## 2006 2007 2008 2009 2010 
##  619  692  622  647  339
str(all$YrSold)
##  num [1:2919] 2008 2007 2008 2006 2008 ...
table(all$MoSold)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12 
## 122 133 232 279 394 503 446 233 158 173 142 104
str(all$MoSold)
##  num [1:2919] 2 5 9 2 12 10 8 11 4 1 ...
all$MoSold<-as.factor(all$MoSold)
###
#Although possible a bit less steep than expected, the effects of the Banking crises that took place at the end of 2007 can be seen indeed. After the highest median prices in 2007, the prices gradually decreased. However, seasonality seems to play a bigger role, as you can see below.



ys <- ggplot(all[!is.na(all$SalePrice),], aes(x=as.factor(YrSold), y=SalePrice)) +
        geom_bar(stat='summary', fun.y = "median", fill='blue')+
        scale_y_continuous(breaks= seq(0, 800000, by=25000), labels = comma) +
        geom_label(stat = "count", aes(label = ..count.., y = ..count..)) +
        coord_cartesian(ylim = c(0, 200000)) +
        geom_hline(yintercept=163000, linetype="dashed", color = "red") #dashed line is median SalePrice

ys<-ggplot(all[!is.na(all$SalePrice),],aes(as.factor(YrSold),SalePrice))+
  geom_bar(stat='summary',fun.y='median',fill='blue')+xlab('YrSold')+
  scale_y_continuous(breaks=seq(0,800000,25000),labels=comma)+
  geom_label(stat='count',aes(label=..count..,y=..count..))+
  coord_cartesian(ylim=c(0,200000))+
  geom_hline(yintercept=163000,linetype='dashed',color='red')
#?coord_cartesian

ms<-ggplot(all[!is.na(all$SalePrice),],aes(as.factor(MoSold),SalePrice))+
  geom_bar(stat="summary", fun.y='median',fill='blue')+xlab('MoSold')+
  scale_y_continuous(breaks = seq(0,800000,25000),labels=comma)+
  geom_label(stat='count', aes(label=..count..,y=..count..))+
  coord_cartesian(ylim=c(0,200000))+
  geom_hline(yintercept=163000,linetype='dashed',color='red')
##stat=summary的功能類似group by + summary, fuy.y表示要在y做什麼事,所以就是group by x軸,針對subset 做fun.y的計算
##這裡是做median,比較下面這行的結果和圖中1月的值
#median(all$SalePrice[all$MoSold==1&!is.na(all$SalePrice)])

grid.arrange(ys,ms,widths=c(1,2))

##grid.arrange把兩張圖一起陳列,width=c(1,2)表示陳列為一個row,兩個col(亦即擺在同一行裡面看,不要分兩行)

#####房價被2007年的次貸風暴影響,但看似月份影響比較大
Changing some numeric variables into factors:MSSubClass
table(all$MSSubClass)
## 
##   20   30   40   45   50   60   70   75   80   85   90  120  150  160  180 
## 1079  139    6   18  287  575  128   23  118   48  109  182    1  128   17 
##  190 
##   61
str(all$MSSubClass)
##  num [1:2919] 60 20 60 70 60 50 20 60 50 190 ...
all$MSSubClass<-as.factor(all$MSSubClass)
all$MSSubClass<-plyr::revalue(all$MSSubClass,c('20'='1 story 1946+', '30'='1 story 1945-', '40'='1 story unf attic', '45'='1,5 story unf', '50'='1,5 story fin', '60'='2 story 1946+', '70'='2 story 1945-', '75'='2,5 story all ages', '80'='split/multi level', '85'='split foyer', '90'='duplex all style/age', '120'='1 story PUD 1946+', '150'='1,5 story PUD all', '160'='2 story PUD 1946+', '180'='PUD multilevel', '190'='2 family conversion'))

Visualization of important variables

# I have now finally reached the point where all character variables have been converted into categorical factors or have been label encoded into numbers. In addition, 3 numeric variables have been converted into factors, and I deleted one variable (Utilities). As you can see below, the number of numerical variables is now 56 (including the response variable), and the remaining 23 variables are categorical.

numericVars<-which(sapply(all,is.numeric))
factorVars<-which(sapply(all,is.factor))
cat('There are', length(numericVars), 'numeric variables, and', length(factorVars), 'categoric variables')
## There are 56 numeric variables, and 16 categoric variables

Correlations again

all_numVar<-all[,numericVars]
cor_numVar<-cor(all_numVar,use='pairwise.complete.obs')
cor_sorted<-as.matrix(sort(cor_numVar[,"SalePrice"],decreasing = TRUE))
CorHigh<-names(which(apply(cor_sorted,1,function(x) abs(x)>0.5)))
cor_numVar<-cor_numVar[CorHigh,CorHigh]
corrplot.mixed(cor_numVar,tl.col='black',tl.pos='lt',tl.cex=0.7,cl.cex=0.7,number.cex=0.7)

#?corrplot.mixed

Finding variable importance with a quick Random Forest

# Although the correlations are giving a good overview of the most important numeric variables and multicolinerity among those variables, I wanted to get an overview of the most important variables including the categorical variables before moving on to visualization.
# I tried to get the relative importance of variables with a quick linear regression model with the calc.relimp function of package , and also tried the boruta function of package boruta which separates the variables into groups that are important or not. However, these method took a long time. As I only want to get an indication of the variable importance, I eventually decided to keep it simple and just use a quick and dirty Random Forest model with only 100 trees. This also does the job for me, and does not take very long as I can specify a (relatively) small number of trees.

nrow(all)
## [1] 2919
###facotrize all character,不然不能random forest
ChaNum(all)
## [1] "LotConfig"     "HouseStyle"    "Exterior1st"   "Exterior2nd"  
## [5] "MasVnrArea"    "Electrical"    "SaleCondition"
all$MSZoning<-as.factor(all$MSZoning)
all$LotConfig<-as.factor(all$LotConfig)
all$HouseStyle<-as.factor(all$HouseStyle)
all$Exterior1st<-as.factor(all$Exterior1st)
all$Exterior2nd<-as.factor(all$Exterior2nd)
all$Electrical<-as.factor(all$Electrical)
all$SaleCondition<-as.factor(all$SaleCondition)


###下面這邊,要查
set.seed(2018)
quick_RF <- randomForest(x=all[1:1460,-79], y=all$SalePrice[1:1460], ntree=100,importance=TRUE)
imp_RF<-importance(quick_RF)
imp_DF<-data.frame(Variables=row.names(imp_RF),MSE=imp_RF[,1])
imp_DF<-imp_DF[order(imp_DF$MSE,decreasing=TRUE),]
  
ggplot(imp_DF[1:20,],aes(reorder(Variables,MSE),MSE,fill=MSE))+
  geom_bar(stat='identity')+labs(x='Variables',y='% increase MSE if Variable is randomly permuted')+
  coord_flip()+theme(legend.position='none')

# Only 3 of those most important variables are categorical according to RF; Neighborhood, MSSubClass, and GarageType.

Overall Quality, and other Quality variables

q1<-ggplot(all,aes(as.factor(OverallQual)))+
  geom_histogram(stat='count')
q2<-ggplot(all,aes(as.factor(ExterQual)))+
  geom_histogram(stat='count')
q3<-ggplot(all,aes(as.factor(BsmtQual)))+
  geom_histogram(stat='count')
q4<-ggplot(all,aes(as.factor(KitchenQual)))+
  geom_histogram(stat='count')
q5<-ggplot(all,aes(as.factor(GarageQual)))+
  geom_histogram(stat = 'count')
q6<-ggplot(all,aes(as.factor(FireplaceQu)))+
  geom_histogram(stat='count')
q7<-ggplot(all,aes(as.factor(PoolQC)))+
  geom_histogram(stat='count')

layout <- matrix(c(1,2,8,3,4,8,5,6,7),3,3,byrow=TRUE)
Rmisc::multiplot(q1,q2,q3,q4,q5,q6,q7,layout = layout)

##layout要給matrix,顯示方式就是照matrix的排列來排,這裏layout放8在中間是因為放一個空的圖,因為有些row要放兩個,有些row要放三個


#Overall Quality is very important, and also more granular than the other variables. External Quality is also improtant, but has a high correlation with Overall Quality (0.73). Kitchen Quality also seems one to keep, as all houses have a kitchen and there is a variance with some substance. Garage Quality does not seem to distinguish much, as the majority of garages have Q3. Fireplace Quality is in the list of high correlations, and in the important variables list. The PoolQC is just very sparse (the 13 pools cannot even be seen on this scale). I will look at creating a ‘has pool’ variable later on.

The second most important categorical variable; MSSubClass

# The first visualization shows the median SalePrice by MSSubClass. The frequency (number of houses) of each MSSubClass in the train set is shown in the labels.
# The histrogram shows the frequencies across all data. Most houses are relatively new, and have one or two stories.

ms1<-ggplot(all[!is.na(all$SalePrice),],aes(MSSubClass,SalePrice))+
  geom_bar(stat='summary',fun.y='median',fill='blue')+
  theme(axis.text.x=element_text(angle=45,hjust=1))+
  scale_y_continuous(breaks=seq(0,800000,50000),labels=comma)+
  geom_label(stat='count',aes(label=..count..,y=..count..),size=3)+
  geom_hline(yintercept=163000,linetype='dashed',color='red')#dashed line is median SalePrice


ms2<-ggplot(all,aes(MSSubClass))+
  geom_histogram(stat='count')+
  geom_label(stat='count',aes(label=..count..,y=..count..),size=3)+
  theme(axis.text.x=element_text(angle = 45,hjust=1))
grid.arrange(ms1,ms2)

Garage variables
# Several Garage variables have a high correlation with SalePrice, and are also in the top-20 list of the quick random forest. However, there is multicolinearity among them and I think that 7 garage variables is too many anyway. I feel that something like 3 variables should be sufficient (possibly GarageCars, GarageType, and a Quality measurement), but before I do any selection I am visualizing all of them in this section.

#correct error
all$GarageType[2593]<-2007
#this must have been a typo. GarageYrBlt=2207, YearBuilt=2006, YearRemodAdd=2007.

g1<-ggplot(all[all$GarageCars!=0,],aes(GarageYrBlt))+geom_histogram()
g2<-ggplot(all,aes(as.factor(GarageCars)))+geom_histogram(stat='count')
g3<-ggplot(all,aes(GarageArea))+geom_density()
g4<-ggplot(all,aes(as.factor(GarageCond)))+geom_histogram(stat='count')
g5<-ggplot(all,aes(GarageType))+geom_histogram(stat='count')
g6<-ggplot(all,aes(as.factor(GarageQual)))+geom_histogram(stat='count')
g7<-ggplot(all,aes(as.factor(GarageFinish)))+geom_histogram(stat='count')

layout<- matrix(c(1,5,5,2,3,8,6,4,7),3,3,byrow=TRUE)
Rmisc::multiplot(g1,g2,g3,g4,g5,g6,g7,layout=layout)

##連續兩格放同一個圖,會變成一張圖佔兩格空間

# Similar the garage variables, multiple basement variables are important in the correlations matrix and the Top 20 RF predictors list. However, 11 basement variables seems an overkill. Before I decide what I am going to do with them, I am visualizing 8 of them below. The 2 “Bathroom” variables are dealt with in Feature Engineering (section 7.1), and the “Basement square feet” is already discussed in section 6.2.1.

b1 <- ggplot(all,aes(BsmtFinSF1))+geom_histogram()+labs(x='Type 1 finished square feet')
b2 <- ggplot(all,aes(BsmtFinSF2))+geom_histogram()+labs(x='Type 2 finished square feet')
b3 <- ggplot(all,aes(BsmtUnfSF))+geom_histogram()+labs(x='Unfinished square feet')
b4 <- ggplot(all,aes(as.factor(BsmtFinType1)))+geom_histogram(stat='count')+labs(x='Rating of Type 1 finished area')
b5 <- ggplot(all,aes(as.factor(BsmtFinType2)))+geom_histogram(stat='count')+labs(x='Rating of Type 2 finished area')
b6 <- ggplot(all,aes(as.factor(BsmtQual)))+geom_histogram(stat='count')+labs(x='Height of the basement')
b7 <- ggplot(all,aes(as.factor(BsmtCond)))+geom_histogram(stat='count')+labs(x='Rating of general condition')
b8 <- ggplot(all,aes(as.factor(BsmtExposure)))+ geom_histogram(stat='count')+labs(x='Walkout or garden level walls')


layout<-matrix(c(1,2,3,4,5,9,6,7,8),3,3,byrow=TRUE)
Rmisc::multiplot(b1,b2,b3,b4,b5,b6,b7,b8,layout=layout)

##So it seemed as if the Total Basement Surface in square feet (TotalBsmtSF) is further broken down into finished areas (2 if more than one type of finish), and unfinished area. I did a check between the correlation of total of those 3 variables, and TotalBsmtSF. The correlation is exactely 1, so that’s a good thing (no errors or small discrepancies)!
#Basement Quality is a confusing variable name, as it turns out that it specifically rates the Height of the basement.

all[,c('TotalBsmtSF',"BsmtFinSF1","BsmtFinSF2","BsmtUnfSF")]
cor(all$TotalBsmtSF,all$BsmtFinSF1+all$BsmtFinSF2+all$BsmtUnfSF)
## [1] 1
##TotalBsmtSF=BsmtFinSF1+BsmtFinSF2+BsmtUnfSF
Feature engineering

Total number of Bathrooms

# There are 4 bathroom variables. Individually, these variables are not very important. However, I assume that I if I add them up into one predictor, this predictor is likely to become a strong one.
# “A half-bath, also known as a powder room or guest bath, has only two of the four main bathroom components-typically a toilet and sink.” Consequently, I will also count the half bathrooms as half.


all$TotBathrooms<-all$FullBath+(all$HalfBath*0.5)+all$BsmtFullBath+(all$BsmtHalfBath*0.5)

# As you can see in the first graph, there now seems to be a clear correlation (it’s 0.63). The frequency distribution of Bathrooms in all data is shown in the second graph.
tb1 <- ggplot(all[!is.na(all$SalePrice),],aes(as.factor(TotBathrooms),SalePrice))+
  geom_point(col='blue')+ geom_smooth(methods='lm',color='black',aes(group=1))+
#group=1????????
  scale_y_continuous(breaks=seq(0,800000,100000),labels=comma)

tb2 <- ggplot(all,aes(as.factor(TotBathrooms)))+
  geom_histogram(stat='count')

grid.arrange(tb1,tb2)

Adding ‘House Age’, ‘Remodeled (Yes/No)’, and IsNew variables

# Altogether, there are 3 variables that are relevant with regards to the Age of a house; YearBlt, YearRemodAdd, and YearSold. YearRemodAdd defaults to YearBuilt if there has been no Remodeling/Addition. I will use YearRemodeled and YearSold to determine the Age. However, as parts of old constructions will always remain and only parts of the house might have been renovated, I will also introduce a Remodeled Yes/No variable. This should be seen as some sort of penalty parameter that indicates that if the Age is based on a remodeling date, it is probably worth less than houses that were built from scratch in that same year.

all$Remod<- ifelse(all$YearBuilt==all$YearRemodAdd,0,1) #0=No Remodeling, 1=Remodeling
all$Age<- as.numeric(all$YrSold)-all$YearRemodAdd

ggplot(all[!is.na(all$SalePrice),],aes(Age,SalePrice))+
  geom_point(col='blue')+geom_smooth(method='lm',color='black',aes(group=1))+
  scale_y_continuous(breaks=seq(0,800000,100000),labels=comma)

# As expected, the graph shows a negative correlation with Age (old house are worth less).

cor(all$SalePrice[!is.na(all$SalePrice)],all$Age[!is.na(all$SalePrice)])
## [1] -0.5090787
# As you can see below, houses that are remodeled are worth less indeed, as expected.

ggplot(all[!is.na(SalePrice),],aes(as.factor(Remod),SalePrice))+
  geom_bar(stat='summary',fun.y='median',fill='blue')+
  geom_label(stat='count',aes(label=..count..,y=..count..),size=6)+
  theme_grey(base_size = 18)+
  geom_hline(yintercept=163000,linetype='dashed') #dashed line is median SalePrice

# Finally, I am creating the IsNew variable below. Altogether, there are 116 new houses in the dataset.
all$IsNew<- ifelse(all$YrSold==all$YearBuilt,1,0)
table(all$IsNew)
## 
##    0    1 
## 2803  116
# These 116 new houses are fairly evenly distributed among train and test set, and as you can see new houses are worth considerably more on average.

ggplot(all[!is.na(all$SalePrice),],aes(as.factor(IsNew),SalePrice))+
  geom_bar(stat='summary',fun.y='median',fill='blue')+
  geom_label(stat='count',aes(label=..count..,y=..count..),size=6)+
  scale_y_continuous(breaks=seq(0,800000,50000),labels=comma)+
  theme_grey(base_size=18)+
  geom_hline(yintercept=163000,linetype='dashed') #dashed line is median SalePrice

all$YrSold <- as.factor(all$YrSold) #the numeric version is now not needed anymore

Binning Neighborhood

nb1<- ggplot(all[!is.na(all$SalePrice),],aes(reorder(Neighborhood,SalePrice,FUN=median),SalePrice))+
  geom_bar(stat='summary',fun.y='median',fill='blue')+labs(x='Neighborhood',y='Medain SalePrice')+
  theme(axis.text.x=element_text(angle=45,hjust=1))+
  scale_y_continuous(breaks=seq(0,800000,50000),labels=comma)+
  geom_hline(yintercept = 163000,linetype='dashed',color='red')+
  geom_label(stat='count',aes(label=..count..,y=..count..),size=3)

nb2<- ggplot(all[!is.na(all$SalePrice),],aes(reorder(Neighborhood,SalePrice,FUN = mean),SalePrice))+
  geom_bar(stat='summary', fun.y='mean', fill='blue')+labs(x='Neighborhood',y='Mean SalePrice')+
  scale_y_continuous(breaks=seq(0,800000,50000),labels=comma)+
  geom_label(stat='count',aes(label=..count..,y=..count..),size=3)+
  theme(axis.text.x=element_text(angle=45,hjust=1))+
  geom_hline(yintercept = 163000,linetype='dashed',color='red') #dashed line is median SalePric

grid.arrange(nb1,nb2)

# Both the median and mean Saleprices agree on 3 neighborhoods with substantially higher saleprices. The separation of the 3 relatively poor neighborhoods is less clear, but at least both graphs agree on the same 3 poor neighborhoods. Since I do not want to ‘overbin’, I am only creating categories for those ‘extremes’.

all$NeighRich[all$Neighborhood%in%c('StoneBr', 'NridgHt', 'NoRidge')]<-2
all$NeighRich[!all$Neighborhood%in%c('MeadowV', 'IDOTRR', 'BrDale', 'StoneBr', 'NridgHt', 'NoRidge')]<-1
all$NeighRich[all$Neighborhood%in%c('MeadowV', 'IDOTRR', 'BrDale')]<-0
sum(table(all$NeighRich))
## [1] 2919

Total Square Feet

# As the total living space generally is very important when people buy houses, I am adding a predictors that adds up the living space above and below ground.

all$TotalSqFeet<-all$GrLivArea+all$TotalBsmtSF

ggplot(all[!is.na(all$SalePrice),],aes(TotalSqFeet,SalePrice))+
  geom_point(col='blue',alpha=0.3,position='jitter')+geom_smooth(method='lm',color='black',aes(group=1))+
  scale_y_continuous(breaks=seq(0,800000,100000),labels=comma)+
  geom_text_repel(aes(label= ifelse(all$GrLivArea[!is.na(all$SalePrice)]>4500,rownames(all),'')))

# As expected, the correlation with SalePrice is very strong indeed (0.78).
cor(all$TotalSqFeet,all$SalePrice,use='pairwise.complete.obs')
## [1] 0.7789588
##上面這行等於這行cor(all$TotalSqFeet[!is.na(SalePrice)],all$SalePrice[!is.na(SalePrice)]),所以use的功用在於排除NA

#The two potential outliers seem to ‘outlie’ even more than before. By taking out these two outliers, the correlation increases by 5%.

cor(all$TotalSqFeet[-c(524,1299)],all$SalePrice[-c(524,1299)],use='pairwise.complete.obs')
## [1] 0.829042
Consolidating Porch variables
# Below, I listed the variables that seem related regarding porches.
# WoodDeckSF: Wood deck area in square feet
# OpenPorchSF: Open porch area in square feet
# EnclosedPorch: Enclosed porch area in square feet
# 3SsnPorch: Three season porch area in square feet
# ScreenPorch: Screen porch area in square feet
# As far as I know, porches are sheltered areas outside of the house, and a wooden deck is unsheltered. Therefore, I am leaving WoodDeckSF alone, and are only consolidating the 4 porch variables.

all$TotalPorchSF<-all$OpenPorchSF+all$EnclosedPorch+all$`3SsnPorch`+all$ScreenPorch
# Although adding up these Porch areas makes sense (there should not be any overlap between areas), the correlation with SalePrice is not very strong.
cor(all$TotalPorchSF,all$SalePrice,use='pairwise.complete.obs')
## [1] 0.1957389
ggplot(all[!is.na(all$SalePrice),],aes(TotalPorchSF,SalePrice))+
  geom_point(col='blue',alpha=0.3)+
  geom_smooth(method='lm',color='black',aes(group=1))+
  scale_y_continuous(breaks=seq(0,800000,100000),labels=comma)

Preparing data for modeling

Dropping highly correlated variables

# First of all, I am dropping a variable if two variables are highly correlated. To find these correlated pairs, I have used the correlations matrix again (see section 6.1). For instance: GarageCars and GarageArea have a correlation of 0.89. Of those two, I am dropping the variable with the lowest correlation with SalePrice (which is GarageArea with a SalePrice correlation of 0.62. GarageCars has a SalePrice correlation of 0.64).

dropVars<- c('YearRemodAdd', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'TotalBsmtSF', 'TotalRmsAbvGrd', 'BsmtFinSF1')
all<-all[,!(names(all)%in%dropVars)]

Removing outliers

all<-all[-c(524,1299),]

PreProcessing predictor variables

#Before modeling I need to center and scale the ‘true numeric’ predictors (so not variables that have been label encoded), and create dummy variables for the categorical predictors. Below, I am splitting the dataframe into one with all (true) numeric variables, and another dataframe holding the (ordinal) factors.


numericVarNames <- numericVarNames[!(numericVarNames%in%c('MSSubClass', 'MoSold', 'YrSold', 'SalePrice', 'OverallQual', 'OverallCond'))] #numericVarNames was created before having done anything
numericVarNames<-append(numericVarNames,c('Age', 'TotalPorchSF', 'TotBathrooms', 'TotalSqFeet'))  

DFnumeric<-all[,names(all)%in%numericVarNames]
DFfactors<-all[,!(names(all)%in%numericVarNames)]
DFfactors<-DFfactors[,names(DFfactors)!='SalePrice']


cat('There are', length(DFnumeric), 'numeric variables, and', length(DFfactors), 'factor variables')
## There are 30 numeric variables, and 49 factor variables

Skewness and normalizing of the numeric predictors